From 9778385db35a799d410039be123044a0d3e917a2 Mon Sep 17 00:00:00 2001
From: Christian Borntraeger
Date: Thu, 15 Nov 2007 20:57:39 +0100
Subject: sched: fix accounting of interrupts during guest execution on s390

Currently the scheduler checks for PF_VCPU to decide if this timeslice
has to be accounted as guest time. On s390, host interrupts are not
disabled during guest execution, so these interrupts get accounted as
guest time if CONFIG_VIRT_CPU_ACCOUNTING is set. The solution is to
check whether an interrupt triggered account_system_time(). As the
tick is timer-interrupt based, we have to subtract hardirq_offset.

I tested the patch on s390 with CONFIG_VIRT_CPU_ACCOUNTING and on
x86_64; it seems to work.

CC: Avi Kivity
CC: Laurent Vivier
Signed-off-by: Christian Borntraeger
Signed-off-by: Ingo Molnar
---
 kernel/sched.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/kernel/sched.c b/kernel/sched.c
index 4fb3532dd7e..908335e1781 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -3390,10 +3390,8 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
 	struct rq *rq = this_rq();
 	cputime64_t tmp;
 
-	if (p->flags & PF_VCPU) {
-		account_guest_time(p, cputime);
-		return;
-	}
+	if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0))
+		return account_guest_time(p, cputime);
 
 	p->stime = cputime_add(p->stime, cputime);
 
--
cgit v1.2.3
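For illustration, here is a hedged sketch of the two kinds of call
sites this check distinguishes (simplified; the exact call sites in
this kernel version may differ). The timer tick runs in hardirq
context and passes HARDIRQ_OFFSET, so its own contribution to
irq_count() cancels out; the s390 virtual cputime accounting path runs
outside any interrupt and passes 0:

	/* tick path, called from hardirq context (sketch): */
	account_system_time(p, HARDIRQ_OFFSET, cputime);

	/*
	 * s390 CONFIG_VIRT_CPU_ACCOUNTING path, called outside
	 * interrupt context, e.g. at context switch (sketch):
	 */
	account_system_time(p, 0, cputime);

In both cases irq_count() - hardirq_offset == 0 holds when the time
was really consumed by the guest; it is nonzero exactly when an
interrupt handler is accounting time that accrued during guest
execution, and that time is now charged to the host as system time.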
From dae51f56204d33444f61d9e7af3ee70aef55daa4 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov
Date: Thu, 15 Nov 2007 20:57:40 +0100
Subject: sched: fix SCHED_FIFO tasks & FAIR_GROUP_SCHED

Suppose that a SCHED_FIFO task does:

	switch_uid(new_user);

Now p->se.cfs_rq and p->se.parent both point into the old
user_struct->tg, because sched_move_task() doesn't call
set_task_cfs_rq() for the !fair_sched_class case.

Suppose that the old user_struct/task_group is freed/reused, and the
task does:

	sched_setscheduler(SCHED_NORMAL);

__setscheduler() sets fair_sched_class, but doesn't update
->se.cfs_rq/parent, which still point to the freed memory.

This means that check_preempt_wakeup(), doing

	while (!is_same_group(se, pse)) {
		se = parent_entity(se);
		pse = parent_entity(pse);
	}

may OOPS in a similar way if rq->curr or p did something like the
above.

Perhaps we need something like the patch below; note that
__setscheduler() can't do set_task_cfs_rq() itself.

Signed-off-by: Oleg Nesterov
Signed-off-by: Ingo Molnar
---
 kernel/sched.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/kernel/sched.c b/kernel/sched.c
index 908335e1781..d62b759753f 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -7087,8 +7087,10 @@ void sched_move_task(struct task_struct *tsk)
 
 	rq = task_rq_lock(tsk, &flags);
 
-	if (tsk->sched_class != &fair_sched_class)
+	if (tsk->sched_class != &fair_sched_class) {
+		set_task_cfs_rq(tsk);
 		goto done;
+	}
 
 	update_rq_clock(rq);
 
--
cgit v1.2.3

From ce96b5ac742801718ae86d2adf0500c5abef3782 Mon Sep 17 00:00:00 2001
From: Dmitry Adamushko
Date: Thu, 15 Nov 2007 20:57:40 +0100
Subject: sched: fix __set_task_cpu() SMP race

Grant Wilson has reported rare SCHED_FAIR_USER crashes on his quad-core
system, which can only be explained by runqueue corruption.

There is a narrow SMP race in __set_task_cpu(): after ->cpu is set up
to a new value, task_rq_lock(p, ...) can be successfully executed on
another CPU. We must ensure that updates of per-task data have been
completed by that moment.

This bug has been hiding in the Linux scheduler for an eternity (we
never had any explicit barrier for task->cpu in set_task_cpu() - so the
bug was introduced in 2.5.1), but only became visible via
set_task_cfs_rq() being accidentally put after the task->cpu update. It
also probably needs a sufficiently out-of-order CPU to trigger.

Reported-by: Grant Wilson
Signed-off-by: Dmitry Adamushko
Signed-off-by: Ingo Molnar
---
 kernel/sched.c | 20 +++++++++++++-------
 1 file changed, 13 insertions(+), 7 deletions(-)

diff --git a/kernel/sched.c b/kernel/sched.c
index d62b759753f..db1f71e3131 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -216,15 +216,15 @@ static inline struct task_group *task_group(struct task_struct *p)
 }
 
 /* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
-static inline void set_task_cfs_rq(struct task_struct *p)
+static inline void set_task_cfs_rq(struct task_struct *p, unsigned int cpu)
 {
-	p->se.cfs_rq = task_group(p)->cfs_rq[task_cpu(p)];
-	p->se.parent = task_group(p)->se[task_cpu(p)];
+	p->se.cfs_rq = task_group(p)->cfs_rq[cpu];
+	p->se.parent = task_group(p)->se[cpu];
 }
 
 #else
 
-static inline void set_task_cfs_rq(struct task_struct *p) { }
+static inline void set_task_cfs_rq(struct task_struct *p, unsigned int cpu) { }
 
 #endif	/* CONFIG_FAIR_GROUP_SCHED */
 
@@ -1022,10 +1022,16 @@ unsigned long weighted_cpuload(const int cpu)
 
 static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
 {
+	set_task_cfs_rq(p, cpu);
 #ifdef CONFIG_SMP
+	/*
+	 * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be
+	 * successfully executed on another CPU. We must ensure that updates
+	 * of per-task data have been completed by this moment.
+	 */
+	smp_wmb();
 	task_thread_info(p)->cpu = cpu;
 #endif
-	set_task_cfs_rq(p);
 }
 
 #ifdef CONFIG_SMP
@@ -7088,7 +7094,7 @@ void sched_move_task(struct task_struct *tsk)
 	rq = task_rq_lock(tsk, &flags);
 
 	if (tsk->sched_class != &fair_sched_class) {
-		set_task_cfs_rq(tsk);
+		set_task_cfs_rq(tsk, task_cpu(tsk));
 		goto done;
 	}
 
@@ -7103,7 +7109,7 @@
 		tsk->sched_class->put_prev_task(rq, tsk);
 	}
 
-	set_task_cfs_rq(tsk);
+	set_task_cfs_rq(tsk, task_cpu(tsk));
 
 	if (on_rq) {
 		if (unlikely(running))
--
cgit v1.2.3
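To see what the new smp_wmb() pairs with, here is a hedged sketch of
both sides of the race (modeled on a task_rq_lock()-style reader;
simplified, not the verbatim kernel code). The reader's spin_lock()
acts as an acquire barrier, so once it observes the new ->cpu it must
also observe the new per-task group data:

	/* writer: __set_task_cpu() after this patch */
	set_task_cfs_rq(p, cpu);		/* per-task data first...    */
	smp_wmb();				/* ...then publish the cpu    */
	task_thread_info(p)->cpu = cpu;

	/* reader: a task_rq_lock()-style loop on another CPU */
	struct rq *rq;
	for (;;) {
		rq = task_rq(p);		/* may observe the new ->cpu  */
		spin_lock(&rq->lock);
		if (likely(rq == task_rq(p)))
			break;			/* rq and group data agree    */
		spin_unlock(&rq->lock);
	}

Without the barrier, a sufficiently out-of-order CPU could make the new
->cpu value visible before the set_task_cfs_rq() stores, and the reader
would then dereference stale ->se.cfs_rq/->se.parent pointers,
corrupting the runqueue.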
From 94bc9a7bd97efdda4dfbe61d0df32ce19d41c0a9 Mon Sep 17 00:00:00 2001
From: Dmitry Adamushko
Date: Thu, 15 Nov 2007 20:57:40 +0100
Subject: sched: remove activate_idle_task()

The cpu_down() code is OK with sched_idle_next() placing the 'idle'
task somewhere other than the beginning of the queue. So get rid of
activate_idle_task() and use activate_task() instead; it was identical
except for the update_rq_clock(rq) call, which the caller now does
directly.

Code size goes down:

   text    data     bss     dec     hex filename
  47853    3934     336   52123    cb9b sched.o.before
  47828    3934     336   52098    cb82 sched.o.after

Signed-off-by: Dmitry Adamushko
Signed-off-by: Ingo Molnar
---
 kernel/sched.c | 22 ++++------------------
 1 file changed, 4 insertions(+), 18 deletions(-)

diff --git a/kernel/sched.c b/kernel/sched.c
index db1f71e3131..cc8cb6f7d82 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -5281,24 +5281,10 @@ static void migrate_live_tasks(int src_cpu)
 	read_unlock(&tasklist_lock);
 }
 
-/*
- * activate_idle_task - move idle task to the _front_ of runqueue.
- */
-static void activate_idle_task(struct task_struct *p, struct rq *rq)
-{
-	update_rq_clock(rq);
-
-	if (p->state == TASK_UNINTERRUPTIBLE)
-		rq->nr_uninterruptible--;
-
-	enqueue_task(rq, p, 0);
-	inc_nr_running(p, rq);
-}
-
 /*
  * Schedules idle task to be the next runnable task on current CPU.
- * It does so by boosting its priority to highest possible and adding it to
- * the _front_ of the runqueue. Used by CPU offline code.
+ * It does so by boosting its priority to highest possible.
+ * Used by CPU offline code.
  */
 void sched_idle_next(void)
 {
@@ -5318,8 +5304,8 @@ void sched_idle_next(void)
 
 	__setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
 
-	/* Add idle task to the _front_ of its priority queue: */
-	activate_idle_task(p, rq);
+	update_rq_clock(rq);
+	activate_task(rq, p, 0);
 
 	spin_unlock_irqrestore(&rq->lock, flags);
--
cgit v1.2.3

From 518b22e990a9071bf508ca67e31b37e7590f499c Mon Sep 17 00:00:00 2001
From: Adrian Bunk
Date: Thu, 15 Nov 2007 20:57:40 +0100
Subject: sched: make sched_nr_latency static

sched_nr_latency can now become static.

Signed-off-by: Adrian Bunk
Signed-off-by: Ingo Molnar
---
 kernel/sched_fair.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index d3c03070872..ee00da284b1 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -43,7 +43,7 @@ unsigned int sysctl_sched_min_granularity = 1000000ULL;
 /*
  * is kept at sysctl_sched_latency / sysctl_sched_min_granularity
  */
-unsigned int sched_nr_latency = 20;
+static unsigned int sched_nr_latency = 20;
 
 /*
  * After fork, child runs first. (default) If set to 0 then
--
cgit v1.2.3

From 9612633a21ae8424531caf977f0560f64285bf36 Mon Sep 17 00:00:00 2001
From: Ingo Molnar
Date: Thu, 15 Nov 2007 20:57:40 +0100
Subject: sched: reorder SCHED_FEAT_ bits

Reorder the SCHED_FEAT_ bits so that the ones in use come first. This
makes tuning instructions easier.

Signed-off-by: Ingo Molnar
---
 kernel/sched.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/kernel/sched.c b/kernel/sched.c
index cc8cb6f7d82..38933cafea8 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -455,18 +455,18 @@ static void update_rq_clock(struct rq *rq)
  */
 enum {
 	SCHED_FEAT_NEW_FAIR_SLEEPERS	= 1,
-	SCHED_FEAT_START_DEBIT		= 2,
-	SCHED_FEAT_TREE_AVG		= 4,
-	SCHED_FEAT_APPROX_AVG		= 8,
-	SCHED_FEAT_WAKEUP_PREEMPT	= 16,
+	SCHED_FEAT_WAKEUP_PREEMPT	= 2,
+	SCHED_FEAT_START_DEBIT		= 4,
+	SCHED_FEAT_TREE_AVG		= 8,
+	SCHED_FEAT_APPROX_AVG		= 16,
 };
 
 const_debug unsigned int sysctl_sched_features =
 		SCHED_FEAT_NEW_FAIR_SLEEPERS	* 1 |
+		SCHED_FEAT_WAKEUP_PREEMPT	* 1 |
 		SCHED_FEAT_START_DEBIT		* 1 |
 		SCHED_FEAT_TREE_AVG		* 0 |
-		SCHED_FEAT_APPROX_AVG		* 0 |
-		SCHED_FEAT_WAKEUP_PREEMPT	* 1;
+		SCHED_FEAT_APPROX_AVG		* 0;
 
 #define sched_feat(x) (sysctl_sched_features & SCHED_FEAT_##x)
 
--
cgit v1.2.3
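Note that the reordering does not change which features are enabled:
each feature keeps its *1/*0 multiplier, and the sched_feat() macro
tests a mask bit wherever it sits, as in this sketch of a hypothetical
feature-gated path (resched_task() used for illustration only):

	/* the bit's numeric value is irrelevant to callers: */
	if (sched_feat(WAKEUP_PREEMPT))
		resched_task(curr);	/* hypothetical gated action */

What changes is the default mask as read by humans: the enabled
features now occupy the low bits, so the default value is
NEW_FAIR_SLEEPERS (1) | WAKEUP_PREEMPT (2) | START_DEBIT (4) = 7, and
tuning instructions only need to mention which low bits to flip.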