-rw-r--r--  include/linux/sched.h     |  24
-rw-r--r--  include/linux/topology.h  |   1
-rw-r--r--  kernel/sched.c            | 193
-rw-r--r--  kernel/sched_debug.c      |  22
-rw-r--r--  kernel/sched_fair.c       |  21
-rw-r--r--  kernel/sched_rt.c         |  14
-rw-r--r--  kernel/sched_stats.h      |   2
7 files changed, 134 insertions, 143 deletions
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 2e490271acf..17249fae501 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -734,7 +734,6 @@ struct sched_domain {
unsigned long max_interval; /* Maximum balance interval ms */
unsigned int busy_factor; /* less balancing by factor if busy */
unsigned int imbalance_pct; /* No balance until over watermark */
- unsigned long long cache_hot_time; /* Task considered cache hot (ns) */
unsigned int cache_nice_tries; /* Leave cache hot tasks for # tries */
unsigned int busy_idx;
unsigned int idle_idx;
@@ -875,7 +874,7 @@ struct sched_class {
void (*set_curr_task) (struct rq *rq);
void (*task_tick) (struct rq *rq, struct task_struct *p);
- void (*task_new) (struct rq *rq, struct task_struct *p);
+ void (*task_new) (struct rq *rq, struct task_struct *p, u64 now);
};
struct load_weight {
@@ -905,23 +904,28 @@ struct sched_entity {
struct rb_node run_node;
unsigned int on_rq;
+ u64 exec_start;
+ u64 sum_exec_runtime;
u64 wait_start_fair;
+ u64 sleep_start_fair;
+
+#ifdef CONFIG_SCHEDSTATS
u64 wait_start;
- u64 exec_start;
+ u64 wait_max;
+ s64 sum_wait_runtime;
+
u64 sleep_start;
- u64 sleep_start_fair;
- u64 block_start;
u64 sleep_max;
+ s64 sum_sleep_runtime;
+
+ u64 block_start;
u64 block_max;
u64 exec_max;
- u64 wait_max;
- u64 last_ran;
- u64 sum_exec_runtime;
- s64 sum_wait_runtime;
- s64 sum_sleep_runtime;
unsigned long wait_runtime_overruns;
unsigned long wait_runtime_underruns;
+#endif
+
#ifdef CONFIG_FAIR_GROUP_SCHED
struct sched_entity *parent;
/* rq on which this entity is (to be) queued: */
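For reference, an illustrative (not from this patch) sched_class::task_new() implementation under the new signature; the names task_new_example/enqueue_task_example are hypothetical, kernel context assumed:

static void task_new_example(struct rq *rq, struct task_struct *p, u64 now)
{
	struct sched_entity *se = &p->se;

	/*
	 * wake_up_new_task() now reads rq_clock() once, passes it in as
	 * 'now', and does the inc_nr_running() accounting itself, so the
	 * class hook only has to place the freshly forked task on its
	 * runqueue.
	 */
	se->exec_start = now;		/* start timing from the caller's clock */
	enqueue_task_example(rq, p);	/* hypothetical class-specific enqueue */
}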
diff --git a/include/linux/topology.h b/include/linux/topology.h
index d0890a7e5ba..525d437b125 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -185,7 +185,6 @@
.max_interval = 64*num_online_cpus(), \
.busy_factor = 128, \
.imbalance_pct = 133, \
- .cache_hot_time = (10*1000000), \
.cache_nice_tries = 1, \
.busy_idx = 3, \
.idle_idx = 3, \
diff --git a/kernel/sched.c b/kernel/sched.c
index 238a76957e8..72bb9483d94 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -637,7 +637,7 @@ static u64 div64_likely32(u64 divident, unsigned long divisor)
#define WMULT_SHIFT 32
-static inline unsigned long
+static unsigned long
calc_delta_mine(unsigned long delta_exec, unsigned long weight,
struct load_weight *lw)
{
@@ -657,7 +657,7 @@ calc_delta_mine(unsigned long delta_exec, unsigned long weight,
tmp = (tmp * lw->inv_weight) >> WMULT_SHIFT;
}
- return (unsigned long)min(tmp, (u64)sysctl_sched_runtime_limit);
+ return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
}
static inline unsigned long
@@ -678,46 +678,6 @@ static void update_load_sub(struct load_weight *lw, unsigned long dec)
lw->inv_weight = 0;
}
-static void __update_curr_load(struct rq *rq, struct load_stat *ls)
-{
- if (rq->curr != rq->idle && ls->load.weight) {
- ls->delta_exec += ls->delta_stat;
- ls->delta_fair += calc_delta_fair(ls->delta_stat, &ls->load);
- ls->delta_stat = 0;
- }
-}
-
-/*
- * Update delta_exec, delta_fair fields for rq.
- *
- * delta_fair clock advances at a rate inversely proportional to
- * total load (rq->ls.load.weight) on the runqueue, while
- * delta_exec advances at the same rate as wall-clock (provided
- * cpu is not idle).
- *
- * delta_exec / delta_fair is a measure of the (smoothened) load on this
- * runqueue over any given interval. This (smoothened) load is used
- * during load balance.
- *
- * This function is called /before/ updating rq->ls.load
- * and when switching tasks.
- */
-static void update_curr_load(struct rq *rq, u64 now)
-{
- struct load_stat *ls = &rq->ls;
- u64 start;
-
- start = ls->load_update_start;
- ls->load_update_start = now;
- ls->delta_stat += now - start;
- /*
- * Stagger updates to ls->delta_fair. Very frequent updates
- * can be expensive.
- */
- if (ls->delta_stat >= sysctl_sched_stat_granularity)
- __update_curr_load(rq, ls);
-}
-
/*
* To aid in avoiding the subversion of "niceness" due to uneven distribution
* of tasks with abnormal "nice" values across CPUs the contribution that
@@ -727,19 +687,6 @@ static void update_curr_load(struct rq *rq, u64 now)
* slice expiry etc.
*/
-/*
- * Assume: static_prio_timeslice(NICE_TO_PRIO(0)) == DEF_TIMESLICE
- * If static_prio_timeslice() is ever changed to break this assumption then
- * this code will need modification
- */
-#define TIME_SLICE_NICE_ZERO DEF_TIMESLICE
-#define load_weight(lp) \
- (((lp) * SCHED_LOAD_SCALE) / TIME_SLICE_NICE_ZERO)
-#define PRIO_TO_LOAD_WEIGHT(prio) \
- load_weight(static_prio_timeslice(prio))
-#define RTPRIO_TO_LOAD_WEIGHT(rp) \
- (PRIO_TO_LOAD_WEIGHT(MAX_RT_PRIO) + load_weight(rp))
-
#define WEIGHT_IDLEPRIO 2
#define WMULT_IDLEPRIO (1 << 31)
@@ -781,32 +728,6 @@ static const u32 prio_to_wmult[40] = {
/* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
};
-static inline void
-inc_load(struct rq *rq, const struct task_struct *p, u64 now)
-{
- update_curr_load(rq, now);
- update_load_add(&rq->ls.load, p->se.load.weight);
-}
-
-static inline void
-dec_load(struct rq *rq, const struct task_struct *p, u64 now)
-{
- update_curr_load(rq, now);
- update_load_sub(&rq->ls.load, p->se.load.weight);
-}
-
-static inline void inc_nr_running(struct task_struct *p, struct rq *rq, u64 now)
-{
- rq->nr_running++;
- inc_load(rq, p, now);
-}
-
-static inline void dec_nr_running(struct task_struct *p, struct rq *rq, u64 now)
-{
- rq->nr_running--;
- dec_load(rq, p, now);
-}
-
static void activate_task(struct rq *rq, struct task_struct *p, int wakeup);
/*
@@ -837,6 +758,72 @@ static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
#define sched_class_highest (&rt_sched_class)
+static void __update_curr_load(struct rq *rq, struct load_stat *ls)
+{
+ if (rq->curr != rq->idle && ls->load.weight) {
+ ls->delta_exec += ls->delta_stat;
+ ls->delta_fair += calc_delta_fair(ls->delta_stat, &ls->load);
+ ls->delta_stat = 0;
+ }
+}
+
+/*
+ * Update delta_exec, delta_fair fields for rq.
+ *
+ * delta_fair clock advances at a rate inversely proportional to
+ * total load (rq->ls.load.weight) on the runqueue, while
+ * delta_exec advances at the same rate as wall-clock (provided
+ * cpu is not idle).
+ *
+ * delta_exec / delta_fair is a measure of the (smoothened) load on this
+ * runqueue over any given interval. This (smoothened) load is used
+ * during load balance.
+ *
+ * This function is called /before/ updating rq->ls.load
+ * and when switching tasks.
+ */
+static void update_curr_load(struct rq *rq, u64 now)
+{
+ struct load_stat *ls = &rq->ls;
+ u64 start;
+
+ start = ls->load_update_start;
+ ls->load_update_start = now;
+ ls->delta_stat += now - start;
+ /*
+ * Stagger updates to ls->delta_fair. Very frequent updates
+ * can be expensive.
+ */
+ if (ls->delta_stat >= sysctl_sched_stat_granularity)
+ __update_curr_load(rq, ls);
+}
+
+static inline void
+inc_load(struct rq *rq, const struct task_struct *p, u64 now)
+{
+ update_curr_load(rq, now);
+ update_load_add(&rq->ls.load, p->se.load.weight);
+}
+
+static inline void
+dec_load(struct rq *rq, const struct task_struct *p, u64 now)
+{
+ update_curr_load(rq, now);
+ update_load_sub(&rq->ls.load, p->se.load.weight);
+}
+
+static void inc_nr_running(struct task_struct *p, struct rq *rq, u64 now)
+{
+ rq->nr_running++;
+ inc_load(rq, p, now);
+}
+
+static void dec_nr_running(struct task_struct *p, struct rq *rq, u64 now)
+{
+ rq->nr_running--;
+ dec_load(rq, p, now);
+}
+
static void set_load_weight(struct task_struct *p)
{
task_rq(p)->cfs.wait_runtime -= p->se.wait_runtime;
@@ -996,18 +983,21 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
u64 clock_offset, fair_clock_offset;
clock_offset = old_rq->clock - new_rq->clock;
- fair_clock_offset = old_rq->cfs.fair_clock -
- new_rq->cfs.fair_clock;
- if (p->se.wait_start)
- p->se.wait_start -= clock_offset;
+ fair_clock_offset = old_rq->cfs.fair_clock - new_rq->cfs.fair_clock;
+
if (p->se.wait_start_fair)
p->se.wait_start_fair -= fair_clock_offset;
+ if (p->se.sleep_start_fair)
+ p->se.sleep_start_fair -= fair_clock_offset;
+
+#ifdef CONFIG_SCHEDSTATS
+ if (p->se.wait_start)
+ p->se.wait_start -= clock_offset;
if (p->se.sleep_start)
p->se.sleep_start -= clock_offset;
if (p->se.block_start)
p->se.block_start -= clock_offset;
- if (p->se.sleep_start_fair)
- p->se.sleep_start_fair -= fair_clock_offset;
+#endif
__set_task_cpu(p, new_cpu);
}
@@ -1568,17 +1558,19 @@ int fastcall wake_up_state(struct task_struct *p, unsigned int state)
static void __sched_fork(struct task_struct *p)
{
p->se.wait_start_fair = 0;
- p->se.wait_start = 0;
p->se.exec_start = 0;
p->se.sum_exec_runtime = 0;
p->se.delta_exec = 0;
p->se.delta_fair_run = 0;
p->se.delta_fair_sleep = 0;
p->se.wait_runtime = 0;
+ p->se.sleep_start_fair = 0;
+
+#ifdef CONFIG_SCHEDSTATS
+ p->se.wait_start = 0;
p->se.sum_wait_runtime = 0;
p->se.sum_sleep_runtime = 0;
p->se.sleep_start = 0;
- p->se.sleep_start_fair = 0;
p->se.block_start = 0;
p->se.sleep_max = 0;
p->se.block_max = 0;
@@ -1586,6 +1578,7 @@ static void __sched_fork(struct task_struct *p)
p->se.wait_max = 0;
p->se.wait_runtime_overruns = 0;
p->se.wait_runtime_underruns = 0;
+#endif
INIT_LIST_HEAD(&p->run_list);
p->se.on_rq = 0;
@@ -1654,22 +1647,27 @@ void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
unsigned long flags;
struct rq *rq;
int this_cpu;
+ u64 now;
rq = task_rq_lock(p, &flags);
BUG_ON(p->state != TASK_RUNNING);
this_cpu = smp_processor_id(); /* parent's CPU */
+ now = rq_clock(rq);
p->prio = effective_prio(p);
- if (!sysctl_sched_child_runs_first || (clone_flags & CLONE_VM) ||
- task_cpu(p) != this_cpu || !current->se.on_rq) {
+ if (!p->sched_class->task_new || !sysctl_sched_child_runs_first ||
+ (clone_flags & CLONE_VM) || task_cpu(p) != this_cpu ||
+ !current->se.on_rq) {
+
activate_task(rq, p, 0);
} else {
/*
* Let the scheduling class do new task startup
* management (if any):
*/
- p->sched_class->task_new(rq, p);
+ p->sched_class->task_new(rq, p, now);
+ inc_nr_running(p, rq, now);
}
check_preempt_curr(rq, p);
task_rq_unlock(rq, &flags);
@@ -2908,8 +2906,7 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
schedstat_inc(sd, alb_cnt);
if (move_tasks(target_rq, target_cpu, busiest_rq, 1,
- RTPRIO_TO_LOAD_WEIGHT(100), sd, CPU_IDLE,
- NULL))
+ ULONG_MAX, sd, CPU_IDLE, NULL))
schedstat_inc(sd, alb_pushed);
else
schedstat_inc(sd, alb_failed);
@@ -5269,8 +5266,6 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd)
sizeof(int), 0644, proc_dointvec_minmax);
set_table_entry(&table[8], 9, "imbalance_pct", &sd->imbalance_pct,
sizeof(int), 0644, proc_dointvec_minmax);
- set_table_entry(&table[9], 10, "cache_hot_time", &sd->cache_hot_time,
- sizeof(long long), 0644, proc_doulongvec_minmax);
set_table_entry(&table[10], 11, "cache_nice_tries",
&sd->cache_nice_tries,
sizeof(int), 0644, proc_dointvec_minmax);
@@ -6590,12 +6585,14 @@ void normalize_rt_tasks(void)
do_each_thread(g, p) {
p->se.fair_key = 0;
p->se.wait_runtime = 0;
+ p->se.exec_start = 0;
p->se.wait_start_fair = 0;
+ p->se.sleep_start_fair = 0;
+#ifdef CONFIG_SCHEDSTATS
p->se.wait_start = 0;
- p->se.exec_start = 0;
p->se.sleep_start = 0;
- p->se.sleep_start_fair = 0;
p->se.block_start = 0;
+#endif
task_rq(p)->cfs.fair_clock = 0;
task_rq(p)->clock = 0;
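A side note on the update_curr_load()/calc_delta_fair() arithmetic moved above: a minimal userspace sketch, assuming NICE_0_LOAD == 1024, inv_weight == 2^32 / weight, and ignoring the large-product pre-shift the kernel version has, shows why delta_exec/delta_fair acts as a load measure:

#include <stdio.h>
#include <stdint.h>

#define WMULT_SHIFT	32
#define NICE_0_LOAD	1024UL

/* delta_exec * weight / rq_weight, done with a 32-bit fixed-point inverse */
static unsigned long calc_delta(unsigned long delta_exec, unsigned long weight,
				unsigned long rq_weight)
{
	uint64_t inv_weight = ((uint64_t)1 << WMULT_SHIFT) / rq_weight;
	uint64_t tmp = (uint64_t)delta_exec * weight;

	return (unsigned long)((tmp * inv_weight) >> WMULT_SHIFT);
}

int main(void)
{
	/* two nice-0 tasks on the runqueue: total load is 2 * NICE_0_LOAD */
	unsigned long delta_exec = 10000000;	/* 10 ms of wall clock, in ns */
	unsigned long delta_fair = calc_delta(delta_exec, NICE_0_LOAD,
					      2 * NICE_0_LOAD);

	/* delta_fair advances at half the wall-clock rate (~5000000 ns),
	 * so delta_exec / delta_fair ~= 2: the smoothed nice-0 load. */
	printf("delta_fair=%lu load=%lu\n", delta_fair, delta_exec / delta_fair);
	return 0;
}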
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 0eca442b779..1c61e5315ad 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -44,11 +44,16 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p, u64 now)
(long long)p->se.wait_runtime,
(long long)(p->nvcsw + p->nivcsw),
p->prio,
+#ifdef CONFIG_SCHEDSTATS
(long long)p->se.sum_exec_runtime,
(long long)p->se.sum_wait_runtime,
(long long)p->se.sum_sleep_runtime,
(long long)p->se.wait_runtime_overruns,
- (long long)p->se.wait_runtime_underruns);
+ (long long)p->se.wait_runtime_underruns
+#else
+ 0LL, 0LL, 0LL, 0LL, 0LL
+#endif
+ );
}
static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu, u64 now)
@@ -171,7 +176,7 @@ static int sched_debug_show(struct seq_file *m, void *v)
u64 now = ktime_to_ns(ktime_get());
int cpu;
- SEQ_printf(m, "Sched Debug Version: v0.05, %s %.*s\n",
+ SEQ_printf(m, "Sched Debug Version: v0.05-v20, %s %.*s\n",
init_utsname()->release,
(int)strcspn(init_utsname()->version, " "),
init_utsname()->version);
@@ -235,21 +240,24 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
#define P(F) \
SEQ_printf(m, "%-25s:%20Ld\n", #F, (long long)p->F)
- P(se.wait_start);
+ P(se.wait_runtime);
P(se.wait_start_fair);
P(se.exec_start);
- P(se.sleep_start);
P(se.sleep_start_fair);
+ P(se.sum_exec_runtime);
+
+#ifdef CONFIG_SCHEDSTATS
+ P(se.wait_start);
+ P(se.sleep_start);
P(se.block_start);
P(se.sleep_max);
P(se.block_max);
P(se.exec_max);
P(se.wait_max);
- P(se.wait_runtime);
P(se.wait_runtime_overruns);
P(se.wait_runtime_underruns);
P(se.sum_wait_runtime);
- P(se.sum_exec_runtime);
+#endif
SEQ_printf(m, "%-25s:%20Ld\n",
"nr_switches", (long long)(p->nvcsw + p->nivcsw));
P(se.load.weight);
@@ -269,7 +277,9 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
void proc_sched_set_task(struct task_struct *p)
{
+#ifdef CONFIG_SCHEDSTATS
p->se.sleep_max = p->se.block_max = p->se.exec_max = p->se.wait_max = 0;
p->se.wait_runtime_overruns = p->se.wait_runtime_underruns = 0;
+#endif
p->se.sum_exec_runtime = 0;
}
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 6971db0a716..6f579ff5a9b 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -292,10 +292,7 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, u64 now)
return;
delta_exec = curr->delta_exec;
-#ifdef CONFIG_SCHEDSTATS
- if (unlikely(delta_exec > curr->exec_max))
- curr->exec_max = delta_exec;
-#endif
+ schedstat_set(curr->exec_max, max((u64)delta_exec, curr->exec_max));
curr->sum_exec_runtime += delta_exec;
cfs_rq->exec_clock += delta_exec;
@@ -352,7 +349,7 @@ static inline void
update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now)
{
se->wait_start_fair = cfs_rq->fair_clock;
- se->wait_start = now;
+ schedstat_set(se->wait_start, now);
}
/*
@@ -425,13 +422,7 @@ __update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now)
{
unsigned long delta_fair = se->delta_fair_run;
-#ifdef CONFIG_SCHEDSTATS
- {
- s64 delta_wait = now - se->wait_start;
- if (unlikely(delta_wait > se->wait_max))
- se->wait_max = delta_wait;
- }
-#endif
+ schedstat_set(se->wait_max, max(se->wait_max, now - se->wait_start));
if (unlikely(se->load.weight != NICE_0_LOAD))
delta_fair = calc_weighted(delta_fair, se->load.weight,
@@ -456,7 +447,7 @@ update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se, u64 now)
}
se->wait_start_fair = 0;
- se->wait_start = 0;
+ schedstat_set(se->wait_start, 0);
}
static inline void
@@ -1041,11 +1032,10 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr)
* monopolize the CPU. Note: the parent runqueue is locked,
* the child is not running yet.
*/
-static void task_new_fair(struct rq *rq, struct task_struct *p)
+static void task_new_fair(struct rq *rq, struct task_struct *p, u64 now)
{
struct cfs_rq *cfs_rq = task_cfs_rq(p);
struct sched_entity *se = &p->se;
- u64 now = rq_clock(rq);
sched_info_queued(p);
@@ -1072,7 +1062,6 @@ static void task_new_fair(struct rq *rq, struct task_struct *p)
p->se.wait_runtime = -(sysctl_sched_granularity / 2);
__enqueue_entity(cfs_rq, se);
- inc_nr_running(p, rq, now);
}
#ifdef CONFIG_FAIR_GROUP_SCHED
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 1192a2741b9..002fcf8d3f6 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -18,8 +18,8 @@ static inline void update_curr_rt(struct rq *rq, u64 now)
delta_exec = now - curr->se.exec_start;
if (unlikely((s64)delta_exec < 0))
delta_exec = 0;
- if (unlikely(delta_exec > curr->se.exec_max))
- curr->se.exec_max = delta_exec;
+
+ schedstat_set(curr->se.exec_max, max(curr->se.exec_max, delta_exec));
curr->se.sum_exec_runtime += delta_exec;
curr->se.exec_start = now;
@@ -229,15 +229,6 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p)
requeue_task_rt(rq, p);
}
-/*
- * No parent/child timeslice management necessary for RT tasks,
- * just activate them:
- */
-static void task_new_rt(struct rq *rq, struct task_struct *p)
-{
- activate_task(rq, p, 1);
-}
-
static struct sched_class rt_sched_class __read_mostly = {
.enqueue_task = enqueue_task_rt,
.dequeue_task = dequeue_task_rt,
@@ -251,5 +242,4 @@ static struct sched_class rt_sched_class __read_mostly = {
.load_balance = load_balance_rt,
.task_tick = task_tick_rt,
- .task_new = task_new_rt,
};
diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h
index c63c38f6fa6..c20a94dda61 100644
--- a/kernel/sched_stats.h
+++ b/kernel/sched_stats.h
@@ -116,6 +116,7 @@ rq_sched_info_depart(struct rq *rq, unsigned long long delta)
}
# define schedstat_inc(rq, field) do { (rq)->field++; } while (0)
# define schedstat_add(rq, field, amt) do { (rq)->field += (amt); } while (0)
+# define schedstat_set(var, val) do { var = (val); } while (0)
#else /* !CONFIG_SCHEDSTATS */
static inline void
rq_sched_info_arrive(struct rq *rq, unsigned long long delta)
@@ -125,6 +126,7 @@ rq_sched_info_depart(struct rq *rq, unsigned long long delta)
{}
# define schedstat_inc(rq, field) do { } while (0)
# define schedstat_add(rq, field, amt) do { } while (0)
+# define schedstat_set(var, val) do { } while (0)
#endif
#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
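A usage sketch for the schedstat_set() helper added above (record_exec_max() is a hypothetical caller, kernel context assumed). Because the !CONFIG_SCHEDSTATS variant never evaluates its arguments, the caller compiles even though se->exec_max only exists under CONFIG_SCHEDSTATS; this is what lets the open-coded #ifdef blocks in sched_fair.c and sched_rt.c go away:

static inline void record_exec_max(struct sched_entity *se, u64 delta_exec)
{
	/* plain assignment with schedstats, compiles to nothing without */
	schedstat_set(se->exec_max, max(se->exec_max, delta_exec));
}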