From af4491e51632d01fbc2b856ffa9ebcd4b38db68c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 19 Aug 2008 12:33:02 +0200 Subject: sched: rt-bandwidth for user grouping interface rt_runtime is a signed value Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/user.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/user.c b/kernel/user.c index 865ecf57a09..39d6159fae4 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -169,7 +169,7 @@ static ssize_t cpu_rt_runtime_show(struct kobject *kobj, { struct user_struct *up = container_of(kobj, struct user_struct, kobj); - return sprintf(buf, "%lu\n", sched_group_rt_runtime(up->tg)); + return sprintf(buf, "%ld\n", sched_group_rt_runtime(up->tg)); } static ssize_t cpu_rt_runtime_store(struct kobject *kobj, @@ -180,7 +180,7 @@ static ssize_t cpu_rt_runtime_store(struct kobject *kobj, unsigned long rt_runtime; int rc; - sscanf(buf, "%lu", &rt_runtime); + sscanf(buf, "%ld", &rt_runtime); rc = sched_group_set_rt_runtime(up->tg, rt_runtime); -- cgit v1.2.3 From 6f0d5c390e4206dcb3804a5072a048fdb7d2b428 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 19 Aug 2008 12:33:03 +0200 Subject: sched: rt-bandwidth accounting fix It fixes an accounting bug where we would continue accumulating runtime even though the bandwidth control is disabled. This would lead to very long throttle periods once bandwidth control gets turned on again. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched_rt.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 998ba54b454..77340b04a53 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -438,9 +438,6 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq) { u64 runtime = sched_rt_runtime(rt_rq); - if (runtime == RUNTIME_INF) - return 0; - if (rt_rq->rt_throttled) return rt_rq_throttled(rt_rq); @@ -491,9 +488,11 @@ static void update_curr_rt(struct rq *rq) rt_rq = rt_rq_of_se(rt_se); spin_lock(&rt_rq->rt_runtime_lock); - rt_rq->rt_time += delta_exec; - if (sched_rt_runtime_exceeded(rt_rq)) - resched_task(curr); + if (sched_rt_runtime(rt_rq) != RUNTIME_INF) { + rt_rq->rt_time += delta_exec; + if (sched_rt_runtime_exceeded(rt_rq)) + resched_task(curr); + } spin_unlock(&rt_rq->rt_runtime_lock); } } -- cgit v1.2.3 From 0b148fa04852859972abbf848177b92daeef138a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 19 Aug 2008 12:33:04 +0200 Subject: sched: rt-bandwidth group disable fixes More extensive disable of bandwidth control. It allows sysctl_sched_rt_runtime to disable full group bandwidth control. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 9 ++++++++- kernel/sched_rt.c | 5 ++++- 2 files changed, 12 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 9a1ddb84e26..c1bee5fb815 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -204,11 +204,13 @@ void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime) rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; } +static inline int rt_bandwidth_enabled(void); + static void start_rt_bandwidth(struct rt_bandwidth *rt_b) { ktime_t now; - if (rt_b->rt_runtime == RUNTIME_INF) + if (rt_bandwidth_enabled() && rt_b->rt_runtime == RUNTIME_INF) return; if (hrtimer_active(&rt_b->rt_period_timer)) @@ -839,6 +841,11 @@ static inline u64 global_rt_runtime(void) return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC; } +static inline int rt_bandwidth_enabled(void) +{ + return sysctl_sched_rt_runtime >= 0; +} + #ifndef prepare_arch_switch # define prepare_arch_switch(next) do { } while (0) #endif diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 77340b04a53..94daace5ee1 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -386,7 +386,7 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) int i, idle = 1; cpumask_t span; - if (rt_b->rt_runtime == RUNTIME_INF) + if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF) return 1; span = sched_rt_period_mask(); @@ -484,6 +484,9 @@ static void update_curr_rt(struct rq *rq) curr->se.exec_start = rq->clock; cpuacct_charge(curr, delta_exec); + if (!rt_bandwidth_enabled()) + return; + for_each_sched_rt_entity(rt_se) { rt_rq = rt_rq_of_se(rt_se); -- cgit v1.2.3 From eb755805f21bd5ded84026e167b7a90887ac42e5 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 19 Aug 2008 12:33:05 +0200 Subject: sched: extract walk_tg_tree() Extract walk_tg_tree() and make it a little more generic so we can use it in the schedulablity test. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 79 ++++++++++++++++++++++++++++++++++------------------------ 1 file changed, 46 insertions(+), 33 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index c1bee5fb815..8c019a19d05 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1387,38 +1387,24 @@ static inline void dec_cpu_load(struct rq *rq, unsigned long load) update_load_sub(&rq->load, load); } -#ifdef CONFIG_SMP -static unsigned long source_load(int cpu, int type); -static unsigned long target_load(int cpu, int type); -static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); - -static unsigned long cpu_avg_load_per_task(int cpu) -{ - struct rq *rq = cpu_rq(cpu); - - if (rq->nr_running) - rq->avg_load_per_task = rq->load.weight / rq->nr_running; - - return rq->avg_load_per_task; -} - -#ifdef CONFIG_FAIR_GROUP_SCHED - -typedef void (*tg_visitor)(struct task_group *, int, struct sched_domain *); +#if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)) +typedef int (*tg_visitor)(struct task_group *, void *); /* * Iterate the full tree, calling @down when first entering a node and @up when * leaving it for the final time. */ -static void -walk_tg_tree(tg_visitor down, tg_visitor up, int cpu, struct sched_domain *sd) +static int walk_tg_tree(tg_visitor down, tg_visitor up, void *data) { struct task_group *parent, *child; + int ret; rcu_read_lock(); parent = &root_task_group; down: - (*down)(parent, cpu, sd); + ret = (*down)(parent, data); + if (ret) + goto out_unlock; list_for_each_entry_rcu(child, &parent->children, siblings) { parent = child; goto down; @@ -1426,14 +1412,42 @@ down: up: continue; } - (*up)(parent, cpu, sd); + ret = (*up)(parent, data); + if (ret) + goto out_unlock; child = parent; parent = parent->parent; if (parent) goto up; +out_unlock: rcu_read_unlock(); + + return ret; +} + +static int tg_nop(struct task_group *tg, void *data) +{ + return 0; } +#endif + +#ifdef CONFIG_SMP +static unsigned long source_load(int cpu, int type); +static unsigned long target_load(int cpu, int type); +static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); + +static unsigned long cpu_avg_load_per_task(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + + if (rq->nr_running) + rq->avg_load_per_task = rq->load.weight / rq->nr_running; + + return rq->avg_load_per_task; +} + +#ifdef CONFIG_FAIR_GROUP_SCHED static void __set_se_shares(struct sched_entity *se, unsigned long shares); @@ -1493,11 +1507,11 @@ __update_group_shares_cpu(struct task_group *tg, int cpu, * This needs to be done in a bottom-up fashion because the rq weight of a * parent group depends on the shares of its child groups. */ -static void -tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd) +static int tg_shares_up(struct task_group *tg, void *data) { unsigned long rq_weight = 0; unsigned long shares = 0; + struct sched_domain *sd = data; int i; for_each_cpu_mask(i, sd->span) { @@ -1522,6 +1536,8 @@ tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd) __update_group_shares_cpu(tg, i, shares, rq_weight); spin_unlock_irqrestore(&rq->lock, flags); } + + return 0; } /* @@ -1529,10 +1545,10 @@ tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd) * This needs to be done in a top-down fashion because the load of a child * group is a fraction of its parents load. */ -static void -tg_load_down(struct task_group *tg, int cpu, struct sched_domain *sd) +static int tg_load_down(struct task_group *tg, void *data) { unsigned long load; + long cpu = (long)data; if (!tg->parent) { load = cpu_rq(cpu)->load.weight; @@ -1543,11 +1559,8 @@ tg_load_down(struct task_group *tg, int cpu, struct sched_domain *sd) } tg->cfs_rq[cpu]->h_load = load; -} -static void -tg_nop(struct task_group *tg, int cpu, struct sched_domain *sd) -{ + return 0; } static void update_shares(struct sched_domain *sd) @@ -1557,7 +1570,7 @@ static void update_shares(struct sched_domain *sd) if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) { sd->last_update = now; - walk_tg_tree(tg_nop, tg_shares_up, 0, sd); + walk_tg_tree(tg_nop, tg_shares_up, sd); } } @@ -1568,9 +1581,9 @@ static void update_shares_locked(struct rq *rq, struct sched_domain *sd) spin_lock(&rq->lock); } -static void update_h_load(int cpu) +static void update_h_load(long cpu) { - walk_tg_tree(tg_load_down, tg_nop, cpu, NULL); + walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); } #else -- cgit v1.2.3 From 9a7e0b180da21885988d47558671cf580279f9d6 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 19 Aug 2008 12:33:06 +0200 Subject: sched: rt-bandwidth fixes The last patch allows sysctl_sched_rt_runtime to disable bandwidth accounting for the group scheduler - however it doesn't deal with sched_setscheduler(), which will keep tasks out of groups that have no assigned runtime. If we relax this, we get into the situation where RT tasks can get into a group when we disable bandwidth control, and then starve them by enabling it again. Rework the schedulability code to check for this condition and fail to turn on bandwidth control with -EBUSY when this situation is found. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 125 +++++++++++++++++++++++++++++---------------------------- 1 file changed, 63 insertions(+), 62 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 8c019a19d05..e41bdae2778 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -300,9 +300,9 @@ static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp; static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity); static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp; #endif /* CONFIG_RT_GROUP_SCHED */ -#else /* !CONFIG_FAIR_GROUP_SCHED */ +#else /* !CONFIG_USER_SCHED */ #define root_task_group init_task_group -#endif /* CONFIG_FAIR_GROUP_SCHED */ +#endif /* CONFIG_USER_SCHED */ /* task_group_lock serializes add/remove of task groups and also changes to * a task group's cpu shares. @@ -1387,7 +1387,7 @@ static inline void dec_cpu_load(struct rq *rq, unsigned long load) update_load_sub(&rq->load, load); } -#if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)) +#if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)) || defined(SCHED_RT_GROUP_SCHED) typedef int (*tg_visitor)(struct task_group *, void *); /* @@ -5082,7 +5082,8 @@ recheck: * Do not allow realtime tasks into groups that have no runtime * assigned. */ - if (rt_policy(policy) && task_group(p)->rt_bandwidth.rt_runtime == 0) + if (rt_bandwidth_enabled() && rt_policy(policy) && + task_group(p)->rt_bandwidth.rt_runtime == 0) return -EPERM; #endif @@ -8707,73 +8708,77 @@ static DEFINE_MUTEX(rt_constraints_mutex); static unsigned long to_ratio(u64 period, u64 runtime) { if (runtime == RUNTIME_INF) - return 1ULL << 16; + return 1ULL << 20; - return div64_u64(runtime << 16, period); + return div64_u64(runtime << 20, period); } -#ifdef CONFIG_CGROUP_SCHED -static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) +/* Must be called with tasklist_lock held */ +static inline int tg_has_rt_tasks(struct task_group *tg) { - struct task_group *tgi, *parent = tg->parent; - unsigned long total = 0; + struct task_struct *g, *p; - if (!parent) { - if (global_rt_period() < period) - return 0; + do_each_thread(g, p) { + if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg) + return 1; + } while_each_thread(g, p); - return to_ratio(period, runtime) < - to_ratio(global_rt_period(), global_rt_runtime()); - } + return 0; +} - if (ktime_to_ns(parent->rt_bandwidth.rt_period) < period) - return 0; +struct rt_schedulable_data { + struct task_group *tg; + u64 rt_period; + u64 rt_runtime; +}; - rcu_read_lock(); - list_for_each_entry_rcu(tgi, &parent->children, siblings) { - if (tgi == tg) - continue; +static int tg_schedulable(struct task_group *tg, void *data) +{ + struct rt_schedulable_data *d = data; + struct task_group *child; + unsigned long total, sum = 0; + u64 period, runtime; + + period = ktime_to_ns(tg->rt_bandwidth.rt_period); + runtime = tg->rt_bandwidth.rt_runtime; - total += to_ratio(ktime_to_ns(tgi->rt_bandwidth.rt_period), - tgi->rt_bandwidth.rt_runtime); + if (tg == d->tg) { + period = d->rt_period; + runtime = d->rt_runtime; } - rcu_read_unlock(); - return total + to_ratio(period, runtime) <= - to_ratio(ktime_to_ns(parent->rt_bandwidth.rt_period), - parent->rt_bandwidth.rt_runtime); -} -#elif defined CONFIG_USER_SCHED -static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) -{ - struct task_group *tgi; - unsigned long total = 0; - unsigned long global_ratio = - to_ratio(global_rt_period(), global_rt_runtime()); + if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg)) + return -EBUSY; - rcu_read_lock(); - list_for_each_entry_rcu(tgi, &task_groups, list) { - if (tgi == tg) - continue; + total = to_ratio(period, runtime); - total += to_ratio(ktime_to_ns(tgi->rt_bandwidth.rt_period), - tgi->rt_bandwidth.rt_runtime); + list_for_each_entry_rcu(child, &tg->children, siblings) { + period = ktime_to_ns(child->rt_bandwidth.rt_period); + runtime = child->rt_bandwidth.rt_runtime; + + if (child == d->tg) { + period = d->rt_period; + runtime = d->rt_runtime; + } + + sum += to_ratio(period, runtime); } - rcu_read_unlock(); - return total + to_ratio(period, runtime) < global_ratio; + if (sum > total) + return -EINVAL; + + return 0; } -#endif -/* Must be called with tasklist_lock held */ -static inline int tg_has_rt_tasks(struct task_group *tg) +static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) { - struct task_struct *g, *p; - do_each_thread(g, p) { - if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg) - return 1; - } while_each_thread(g, p); - return 0; + struct rt_schedulable_data data = { + .tg = tg, + .rt_period = period, + .rt_runtime = runtime, + }; + + return walk_tg_tree(tg_schedulable, tg_nop, &data); } static int tg_set_bandwidth(struct task_group *tg, @@ -8783,14 +8788,9 @@ static int tg_set_bandwidth(struct task_group *tg, mutex_lock(&rt_constraints_mutex); read_lock(&tasklist_lock); - if (rt_runtime == 0 && tg_has_rt_tasks(tg)) { - err = -EBUSY; + err = __rt_schedulable(tg, rt_period, rt_runtime); + if (err) goto unlock; - } - if (!__rt_schedulable(tg, rt_period, rt_runtime)) { - err = -EINVAL; - goto unlock; - } spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock); tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period); @@ -8867,8 +8867,9 @@ static int sched_rt_global_constraints(void) rt_runtime = tg->rt_bandwidth.rt_runtime; mutex_lock(&rt_constraints_mutex); - if (!__rt_schedulable(tg, rt_period, rt_runtime)) - ret = -EINVAL; + read_lock(&tasklist_lock); + ret = __rt_schedulable(tg, rt_period, rt_runtime); + read_unlock(&tasklist_lock); mutex_unlock(&rt_constraints_mutex); return ret; -- cgit v1.2.3 From 7940ca3605b77f20cc6e9852e4ca6f2d725b5653 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 19 Aug 2008 13:40:47 +0200 Subject: sched: extract walk_tg_tree(), fix fix: kernel/sched.c: In function '__rt_schedulable': kernel/sched.c:8771: error: implicit declaration of function 'walk_tg_tree' kernel/sched.c:8771: error: 'tg_nop' undeclared (first use in this function) kernel/sched.c:8771: error: (Each undeclared identifier is reported only once kernel/sched.c:8771: error: for each function it appears in.) Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index e41bdae2778..703f56d5db5 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1387,7 +1387,7 @@ static inline void dec_cpu_load(struct rq *rq, unsigned long load) update_load_sub(&rq->load, load); } -#if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)) || defined(SCHED_RT_GROUP_SCHED) +#if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)) || defined(CONFIG_RT_GROUP_SCHED) typedef int (*tg_visitor)(struct task_group *, void *); /* -- cgit v1.2.3 From c8bfff6dd4d41834f4952cbc49e28e31906a6188 Mon Sep 17 00:00:00 2001 From: Krzysztof Helt Date: Fri, 5 Sep 2008 23:46:19 +0200 Subject: sched: compilation fix with gcc 3.4.6 I found that 2.6.27-rc5-mm1 does not compile with gcc 3.4.6. The error is: CC kernel/sched.o kernel/sched.c: In function `start_rt_bandwidth': kernel/sched.c:208: sorry, unimplemented: inlining failed in call to 'rt_bandwidth_enabled': function body not available kernel/sched.c:214: sorry, unimplemented: called from here make[1]: *** [kernel/sched.o] Error 1 make: *** [kernel] Error 2 It seems that the gcc 3.4.6 requires full inline definition before first usage. The patch below fixes the compilation problem. Signed-off-by: Krzysztof Helt (if needed> Signed-off-by: Ingo Molnar --- kernel/sched.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 703f56d5db5..4de2bfb28c5 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -204,7 +204,10 @@ void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime) rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; } -static inline int rt_bandwidth_enabled(void); +static inline int rt_bandwidth_enabled(void) +{ + return sysctl_sched_rt_runtime >= 0; +} static void start_rt_bandwidth(struct rt_bandwidth *rt_b) { @@ -841,11 +844,6 @@ static inline u64 global_rt_runtime(void) return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC; } -static inline int rt_bandwidth_enabled(void) -{ - return sysctl_sched_rt_runtime >= 0; -} - #ifndef prepare_arch_switch # define prepare_arch_switch(next) do { } while (0) #endif -- cgit v1.2.3 From e75b986af7881ed8d8ccb1ed154045ed17cfebd0 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 9 Sep 2008 21:38:57 +0200 Subject: clockevents: remove WARN_ON which was used to gather information The issue of the endless reprogramming loop due to a too small min_delta_ns was fixed with the previous updates of the clock events code, but we had no information about the spread of this problem. I added a WARN_ON to get automated information via kerneloops.org and to get some direct reports, which allowed me to analyse the affected machines. The WARN_ON has served its purpose and would be annoying for a release kernel. Remove it and just keep the information about the increase of the min_delta_ns value. Signed-off-by: Thomas Gleixner --- kernel/time/tick-oneshot.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c index 2e35501e61d..2e8de678e76 100644 --- a/kernel/time/tick-oneshot.c +++ b/kernel/time/tick-oneshot.c @@ -43,19 +43,17 @@ int tick_dev_program_event(struct clock_event_device *dev, ktime_t expires, * and emit a warning. */ if (++i > 2) { - printk(KERN_WARNING "CE: __tick_program_event of %s is " - "stuck %llx %llx\n", dev->name ? dev->name : "?", - now.tv64, expires.tv64); - printk(KERN_WARNING - "CE: increasing min_delta_ns %ld to %ld nsec\n", - dev->min_delta_ns, dev->min_delta_ns << 1); - WARN_ON(1); - - /* Double the min. delta and try again */ + /* Increase the min. delta and try again */ if (!dev->min_delta_ns) dev->min_delta_ns = 5000; else - dev->min_delta_ns <<= 1; + dev->min_delta_ns += dev->min_delta_ns >> 1; + + printk(KERN_WARNING + "CE: %s increasing min_delta_ns to %lu nsec\n", + dev->name ? dev->name : "?", + dev->min_delta_ns << 1); + i = 0; } -- cgit v1.2.3 From baf25731e54d06eb13dc4eda78c6dc7da4ce84e8 Mon Sep 17 00:00:00 2001 From: "Zhang, Yanmin" Date: Tue, 9 Sep 2008 11:26:33 +0800 Subject: sched: fix 2.6.27-rc5 couldn't boot on tulsa machine randomly On my tulsa x86-64 machine, kernel 2.6.25-rc5 couldn't boot randomly. Basically, function __enable_runtime forgets to reset rt_rq->rt_throttled to 0. When every cpu is up, per-cpu migration_thread is created and it runs very fast, sometimes to mark the corresponding rt_rq->rt_throttled to 1 very quickly. After all cpus are up, with below calling chain: sched_init_smp => arch_init_sched_domains => build_sched_domains => ... => cpu_attach_domain => rq_attach_root => set_rq_online => ... => _enable_runtime _enable_runtime is called against every rt_rq again, so rt_rq->rt_time is reset to 0, but rt_rq->rt_throttled might be still 1. Later on function do_sched_rt_period_timer couldn't reset it, and all RT tasks couldn't be scheduled to run on that cpu. here is RT task migration_thread which is woken up when a task is migrated to another cpu. Below patch fixes it against 2.6.27-rc5. Signed-off-by: Zhang Yanmin Signed-off-by: Ingo Molnar --- kernel/sched_rt.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 552310798da..1113157b205 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -350,6 +350,7 @@ static void __enable_runtime(struct rq *rq) spin_lock(&rt_rq->rt_runtime_lock); rt_rq->rt_runtime = rt_b->rt_runtime; rt_rq->rt_time = 0; + rt_rq->rt_throttled = 0; spin_unlock(&rt_rq->rt_runtime_lock); spin_unlock(&rt_b->rt_runtime_lock); } -- cgit v1.2.3 From ec5d498991e87c74730509508b25c3959192b7e7 Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Wed, 10 Sep 2008 17:00:19 -0700 Subject: sched: fix deadlock in setting scheduler parameter to zero Andrei Gusev wrote: > I played witch scheduler settings. After doing something like: > echo -n 1000000 >sched_rt_period_us > > command is locked. I found in kernel.log: > > Sep 11 00:39:34 zaratustra > Sep 11 00:39:34 zaratustra Pid: 4495, comm: bash Tainted: G W > (2.6.26.3 #12) > Sep 11 00:39:34 zaratustra EIP: 0060:[] EFLAGS: 00210246 CPU: 0 > Sep 11 00:39:34 zaratustra EIP is at div64_u64+0x57/0x80 > Sep 11 00:39:34 zaratustra EAX: 0000389f EBX: 00000000 ECX: 00000000 > EDX: 00000000 > Sep 11 00:39:34 zaratustra ESI: d9800000 EDI: d9800000 EBP: 0000389f > ESP: ea7a6edc > Sep 11 00:39:34 zaratustra DS: 007b ES: 007b FS: 0000 GS: 0033 SS: 0068 > Sep 11 00:39:34 zaratustra Process bash (pid: 4495, ti=ea7a6000 > task=ea744000 task.ti=ea7a6000) > Sep 11 00:39:34 zaratustra Stack: 00000000 000003e8 d9800000 0000389f > c0119042 00000000 00000000 00000001 > Sep 11 00:39:34 zaratustra 00000000 00000000 ea7a6f54 00010000 00000000 > c04d2e80 00000001 000e7ef0 > Sep 11 00:39:34 zaratustra c01191a3 00000000 00000000 ea7a6fa0 00000001 > ffffffff c04d2e80 ea5b2480 > Sep 11 00:39:34 zaratustra Call Trace: > Sep 11 00:39:34 zaratustra [] __rt_schedulable+0x52/0x130 > Sep 11 00:39:34 zaratustra [] sched_rt_handler+0x83/0x120 > Sep 11 00:39:34 zaratustra [] proc_sys_call_handler+0xb6/0xd0 > Sep 11 00:39:34 zaratustra [] proc_sys_write+0x0/0x20 > Sep 11 00:39:34 zaratustra [] proc_sys_write+0x19/0x20 > Sep 11 00:39:34 zaratustra [] vfs_write+0xa8/0x140 > Sep 11 00:39:34 zaratustra [] sys_write+0x41/0x80 > Sep 11 00:39:34 zaratustra [] sysenter_past_esp+0x6a/0x91 > Sep 11 00:39:34 zaratustra ======================= > Sep 11 00:39:34 zaratustra Code: c8 41 0f ad f3 d3 ee f6 c1 20 0f 45 de > 31 f6 0f ad ef d3 ed f6 c1 20 0f 45 fd 0f 45 ee 31 c9 39 eb 89 fe 89 ea > 77 08 89 e8 31 d2 f3 89 c1 89 f0 8b 7c 24 08 f7 f3 8b 74 24 04 89 > ca 8b 1c 24 > Sep 11 00:39:34 zaratustra EIP: [] div64_u64+0x57/0x80 SS:ESP > 0068:ea7a6edc > Sep 11 00:39:34 zaratustra ---[ end trace 4eaa2a86a8e2da22 ]--- fix the boundary condition. sysctl_sched_rt_period=0 makes exception at to_ratio(). Signed-off-by: Hiroshi Shimamoto Signed-off-by: Ingo Molnar --- kernel/sched.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index cc1f81b50b8..98890807375 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -8909,6 +8909,9 @@ static int sched_rt_global_constraints(void) u64 rt_runtime, rt_period; int ret = 0; + if (sysctl_sched_rt_period <= 0) + return -EINVAL; + rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period); rt_runtime = tg->rt_bandwidth.rt_runtime; @@ -8925,6 +8928,9 @@ static int sched_rt_global_constraints(void) unsigned long flags; int i; + if (sysctl_sched_rt_period <= 0) + return -EINVAL; + spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); for_each_possible_cpu(i) { struct rt_rq *rt_rq = &cpu_rq(i)->rt; -- cgit v1.2.3 From 4e74339af6a59c061cf02f1ac497766bca1de19a Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Sat, 13 Sep 2008 02:33:08 -0700 Subject: cpuset: avoid changing cpuset's cpus when -errno returned After the patch: commit 0b2f630a28d53b5a2082a5275bc3334b10373508 Author: Miao Xie Date: Fri Jul 25 01:47:21 2008 -0700 cpusets: restructure the function update_cpumask() and update_nodemask() It might happen that 'echo 0 > /cpuset/sub/cpus' returned failure but 'cpus' has been changed, because cpus was changed before calling heap_init() which may return -ENOMEM. This patch restores the orginal behavior. Signed-off-by: Li Zefan Acked-by: Paul Menage Cc: Paul Jackson Cc: Miao Xie Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpuset.c | 37 +++++++++++++++---------------------- 1 file changed, 15 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index f227bc17269..827cd9adccb 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -843,37 +843,25 @@ static void cpuset_change_cpumask(struct task_struct *tsk, /** * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset. * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed + * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks() * * Called with cgroup_mutex held * * The cgroup_scan_tasks() function will scan all the tasks in a cgroup, * calling callback functions for each. * - * Return 0 if successful, -errno if not. + * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0 + * if @heap != NULL. */ -static int update_tasks_cpumask(struct cpuset *cs) +static void update_tasks_cpumask(struct cpuset *cs, struct ptr_heap *heap) { struct cgroup_scanner scan; - struct ptr_heap heap; - int retval; - - /* - * cgroup_scan_tasks() will initialize heap->gt for us. - * heap_init() is still needed here for we should not change - * cs->cpus_allowed when heap_init() fails. - */ - retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL); - if (retval) - return retval; scan.cg = cs->css.cgroup; scan.test_task = cpuset_test_cpumask; scan.process_task = cpuset_change_cpumask; - scan.heap = &heap; - retval = cgroup_scan_tasks(&scan); - - heap_free(&heap); - return retval; + scan.heap = heap; + cgroup_scan_tasks(&scan); } /** @@ -883,6 +871,7 @@ static int update_tasks_cpumask(struct cpuset *cs) */ static int update_cpumask(struct cpuset *cs, const char *buf) { + struct ptr_heap heap; struct cpuset trialcs; int retval; int is_load_balanced; @@ -917,6 +906,10 @@ static int update_cpumask(struct cpuset *cs, const char *buf) if (cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed)) return 0; + retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL); + if (retval) + return retval; + is_load_balanced = is_sched_load_balance(&trialcs); mutex_lock(&callback_mutex); @@ -927,9 +920,9 @@ static int update_cpumask(struct cpuset *cs, const char *buf) * Scan tasks in the cpuset, and update the cpumasks of any * that need an update. */ - retval = update_tasks_cpumask(cs); - if (retval < 0) - return retval; + update_tasks_cpumask(cs, &heap); + + heap_free(&heap); if (is_load_balanced) async_rebuild_sched_domains(); @@ -1965,7 +1958,7 @@ static void scan_for_empty_cpusets(const struct cpuset *root) nodes_empty(cp->mems_allowed)) remove_tasks_in_empty_cpuset(cp); else { - update_tasks_cpumask(cp); + update_tasks_cpumask(cp, NULL); update_tasks_nodemask(cp, &oldmems); } } -- cgit v1.2.3 From 2344abbcbdb82140050e8be29d3d55e4f6fe860b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 16 Sep 2008 11:32:50 -0700 Subject: clockevents: make device shutdown robust The device shut down does not cleanup the next_event variable of the clock event device. So when the device is reactivated the possible stale next_event value can prevent the device to be reprogrammed as it claims to wait on a event already. This is the root cause of the resurfacing suspend/resume problem, where systems need key press to come back to life. Fix this by setting next_event to KTIME_MAX when the device is shut down. Use a separate function for shutdown which takes care of that and only keep the direct set mode call in the broadcast code, where we can not touch the next_event value. Signed-off-by: Thomas Gleixner --- kernel/time/clockevents.c | 12 +++++++++++- kernel/time/tick-broadcast.c | 9 ++++----- kernel/time/tick-common.c | 4 ++-- kernel/time/tick-internal.h | 2 ++ 4 files changed, 19 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 1876b526c77..f8d968063ce 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -71,6 +71,16 @@ void clockevents_set_mode(struct clock_event_device *dev, } } +/** + * clockevents_shutdown - shutdown the device and clear next_event + * @dev: device to shutdown + */ +void clockevents_shutdown(struct clock_event_device *dev) +{ + clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN); + dev->next_event.tv64 = KTIME_MAX; +} + /** * clockevents_program_event - Reprogram the clock event device. * @expires: absolute expiry time (monotonic clock) @@ -206,7 +216,7 @@ void clockevents_exchange_device(struct clock_event_device *old, if (new) { BUG_ON(new->mode != CLOCK_EVT_MODE_UNUSED); - clockevents_set_mode(new, CLOCK_EVT_MODE_SHUTDOWN); + clockevents_shutdown(new); } local_irq_restore(flags); } diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 2f5a38294bf..f1f3eee2811 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -236,8 +236,7 @@ static void tick_do_broadcast_on_off(void *why) if (!cpu_isset(cpu, tick_broadcast_mask)) { cpu_set(cpu, tick_broadcast_mask); if (td->mode == TICKDEV_MODE_PERIODIC) - clockevents_set_mode(dev, - CLOCK_EVT_MODE_SHUTDOWN); + clockevents_shutdown(dev); } if (*reason == CLOCK_EVT_NOTIFY_BROADCAST_FORCE) tick_broadcast_force = 1; @@ -254,7 +253,7 @@ static void tick_do_broadcast_on_off(void *why) if (cpus_empty(tick_broadcast_mask)) { if (!bc_stopped) - clockevents_set_mode(bc, CLOCK_EVT_MODE_SHUTDOWN); + clockevents_shutdown(bc); } else if (bc_stopped) { if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) tick_broadcast_start_periodic(bc); @@ -306,7 +305,7 @@ void tick_shutdown_broadcast(unsigned int *cpup) if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) { if (bc && cpus_empty(tick_broadcast_mask)) - clockevents_set_mode(bc, CLOCK_EVT_MODE_SHUTDOWN); + clockevents_shutdown(bc); } spin_unlock_irqrestore(&tick_broadcast_lock, flags); @@ -321,7 +320,7 @@ void tick_suspend_broadcast(void) bc = tick_broadcast_device.evtdev; if (bc) - clockevents_set_mode(bc, CLOCK_EVT_MODE_SHUTDOWN); + clockevents_shutdown(bc); spin_unlock_irqrestore(&tick_broadcast_lock, flags); } diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index c4777193d56..019315ebf9d 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -249,7 +249,7 @@ static int tick_check_new_device(struct clock_event_device *newdev) * not give it back to the clockevents layer ! */ if (tick_is_broadcast_device(curdev)) { - clockevents_set_mode(curdev, CLOCK_EVT_MODE_SHUTDOWN); + clockevents_shutdown(curdev); curdev = NULL; } clockevents_exchange_device(curdev, newdev); @@ -311,7 +311,7 @@ static void tick_suspend(void) unsigned long flags; spin_lock_irqsave(&tick_device_lock, flags); - clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_SHUTDOWN); + clockevents_shutdown(td->evtdev); spin_unlock_irqrestore(&tick_device_lock, flags); } diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index 0ffc2918ea6..6e9db9734aa 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -10,6 +10,8 @@ extern int tick_do_timer_cpu __read_mostly; extern void tick_setup_periodic(struct clock_event_device *dev, int broadcast); extern void tick_handle_periodic(struct clock_event_device *dev); +extern void clockevents_shutdown(struct clock_event_device *dev); + /* * NO_HZ / high resolution timer shared code */ -- cgit v1.2.3 From fa748203175de7c08f2df80e5a0eeca40329b5e2 Mon Sep 17 00:00:00 2001 From: Rakib Mullick Date: Mon, 22 Sep 2008 14:55:45 -0700 Subject: sched: fix init_hrtick() section mismatch warning LD kernel/built-in.o WARNING: kernel/built-in.o(.text+0x326): Section mismatch in reference from the function init_hrtick() to the variable .cpuinit.data:hotplug_hrtick_nb.8 The function init_hrtick() references the variable __cpuinitdata hotplug_hrtick_nb.8. This is often because init_hrtick lacks a __cpuinitdata annotation or the annotation of hotplug_hrtick_nb.8 is wrong. Signed-off-by: Md.Rakib H. Mullick Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 98890807375..13dd2db9fb2 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1087,7 +1087,7 @@ hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu) return NOTIFY_DONE; } -static void init_hrtick(void) +static __init void init_hrtick(void) { hotcpu_notifier(hotplug_hrtick, 0); } -- cgit v1.2.3