From 0b148fa04852859972abbf848177b92daeef138a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 19 Aug 2008 12:33:04 +0200 Subject: sched: rt-bandwidth group disable fixes More extensive disable of bandwidth control. It allows sysctl_sched_rt_runtime to disable full group bandwidth control. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 9a1ddb84e26..c1bee5fb815 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -204,11 +204,13 @@ void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime) rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; } +static inline int rt_bandwidth_enabled(void); + static void start_rt_bandwidth(struct rt_bandwidth *rt_b) { ktime_t now; - if (rt_b->rt_runtime == RUNTIME_INF) + if (rt_bandwidth_enabled() && rt_b->rt_runtime == RUNTIME_INF) return; if (hrtimer_active(&rt_b->rt_period_timer)) @@ -839,6 +841,11 @@ static inline u64 global_rt_runtime(void) return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC; } +static inline int rt_bandwidth_enabled(void) +{ + return sysctl_sched_rt_runtime >= 0; +} + #ifndef prepare_arch_switch # define prepare_arch_switch(next) do { } while (0) #endif -- cgit v1.2.3 From eb755805f21bd5ded84026e167b7a90887ac42e5 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 19 Aug 2008 12:33:05 +0200 Subject: sched: extract walk_tg_tree() Extract walk_tg_tree() and make it a little more generic so we can use it in the schedulability test.
Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 79 ++++++++++++++++++++++++++++++++++------------------------ 1 file changed, 46 insertions(+), 33 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index c1bee5fb815..8c019a19d05 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1387,38 +1387,24 @@ static inline void dec_cpu_load(struct rq *rq, unsigned long load) update_load_sub(&rq->load, load); } -#ifdef CONFIG_SMP -static unsigned long source_load(int cpu, int type); -static unsigned long target_load(int cpu, int type); -static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); - -static unsigned long cpu_avg_load_per_task(int cpu) -{ - struct rq *rq = cpu_rq(cpu); - - if (rq->nr_running) - rq->avg_load_per_task = rq->load.weight / rq->nr_running; - - return rq->avg_load_per_task; -} - -#ifdef CONFIG_FAIR_GROUP_SCHED - -typedef void (*tg_visitor)(struct task_group *, int, struct sched_domain *); +#if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)) +typedef int (*tg_visitor)(struct task_group *, void *); /* * Iterate the full tree, calling @down when first entering a node and @up when * leaving it for the final time. 
*/ -static void -walk_tg_tree(tg_visitor down, tg_visitor up, int cpu, struct sched_domain *sd) +static int walk_tg_tree(tg_visitor down, tg_visitor up, void *data) { struct task_group *parent, *child; + int ret; rcu_read_lock(); parent = &root_task_group; down: - (*down)(parent, cpu, sd); + ret = (*down)(parent, data); + if (ret) + goto out_unlock; list_for_each_entry_rcu(child, &parent->children, siblings) { parent = child; goto down; @@ -1426,14 +1412,42 @@ down: up: continue; } - (*up)(parent, cpu, sd); + ret = (*up)(parent, data); + if (ret) + goto out_unlock; child = parent; parent = parent->parent; if (parent) goto up; +out_unlock: rcu_read_unlock(); + + return ret; +} + +static int tg_nop(struct task_group *tg, void *data) +{ + return 0; } +#endif + +#ifdef CONFIG_SMP +static unsigned long source_load(int cpu, int type); +static unsigned long target_load(int cpu, int type); +static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); + +static unsigned long cpu_avg_load_per_task(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + + if (rq->nr_running) + rq->avg_load_per_task = rq->load.weight / rq->nr_running; + + return rq->avg_load_per_task; +} + +#ifdef CONFIG_FAIR_GROUP_SCHED static void __set_se_shares(struct sched_entity *se, unsigned long shares); @@ -1493,11 +1507,11 @@ __update_group_shares_cpu(struct task_group *tg, int cpu, * This needs to be done in a bottom-up fashion because the rq weight of a * parent group depends on the shares of its child groups. 
*/ -static void -tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd) +static int tg_shares_up(struct task_group *tg, void *data) { unsigned long rq_weight = 0; unsigned long shares = 0; + struct sched_domain *sd = data; int i; for_each_cpu_mask(i, sd->span) { @@ -1522,6 +1536,8 @@ tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd) __update_group_shares_cpu(tg, i, shares, rq_weight); spin_unlock_irqrestore(&rq->lock, flags); } + + return 0; } /* @@ -1529,10 +1545,10 @@ tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd) * This needs to be done in a top-down fashion because the load of a child * group is a fraction of its parents load. */ -static void -tg_load_down(struct task_group *tg, int cpu, struct sched_domain *sd) +static int tg_load_down(struct task_group *tg, void *data) { unsigned long load; + long cpu = (long)data; if (!tg->parent) { load = cpu_rq(cpu)->load.weight; @@ -1543,11 +1559,8 @@ tg_load_down(struct task_group *tg, int cpu, struct sched_domain *sd) } tg->cfs_rq[cpu]->h_load = load; -} -static void -tg_nop(struct task_group *tg, int cpu, struct sched_domain *sd) -{ + return 0; } static void update_shares(struct sched_domain *sd) @@ -1557,7 +1570,7 @@ static void update_shares(struct sched_domain *sd) if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) { sd->last_update = now; - walk_tg_tree(tg_nop, tg_shares_up, 0, sd); + walk_tg_tree(tg_nop, tg_shares_up, sd); } } @@ -1568,9 +1581,9 @@ static void update_shares_locked(struct rq *rq, struct sched_domain *sd) spin_lock(&rq->lock); } -static void update_h_load(int cpu) +static void update_h_load(long cpu) { - walk_tg_tree(tg_load_down, tg_nop, cpu, NULL); + walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); } #else -- cgit v1.2.3 From 9a7e0b180da21885988d47558671cf580279f9d6 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 19 Aug 2008 12:33:06 +0200 Subject: sched: rt-bandwidth fixes The last patch allows sysctl_sched_rt_runtime to 
disable bandwidth accounting for the group scheduler - however it doesn't deal with sched_setscheduler(), which will keep tasks out of groups that have no assigned runtime. If we relax this, we get into the situation where RT tasks can get into a group when we disable bandwidth control, and then starve them by enabling it again. Rework the schedulability code to check for this condition and fail to turn on bandwidth control with -EBUSY when this situation is found. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 125 +++++++++++++++++++++++++++++---------------------------- 1 file changed, 63 insertions(+), 62 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 8c019a19d05..e41bdae2778 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -300,9 +300,9 @@ static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp; static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity); static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp; #endif /* CONFIG_RT_GROUP_SCHED */ -#else /* !CONFIG_FAIR_GROUP_SCHED */ +#else /* !CONFIG_USER_SCHED */ #define root_task_group init_task_group -#endif /* CONFIG_FAIR_GROUP_SCHED */ +#endif /* CONFIG_USER_SCHED */ /* task_group_lock serializes add/remove of task groups and also changes to * a task group's cpu shares. @@ -1387,7 +1387,7 @@ static inline void dec_cpu_load(struct rq *rq, unsigned long load) update_load_sub(&rq->load, load); } -#if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)) +#if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)) || defined(SCHED_RT_GROUP_SCHED) typedef int (*tg_visitor)(struct task_group *, void *); /* @@ -5082,7 +5082,8 @@ recheck: * Do not allow realtime tasks into groups that have no runtime * assigned. 
*/ - if (rt_policy(policy) && task_group(p)->rt_bandwidth.rt_runtime == 0) + if (rt_bandwidth_enabled() && rt_policy(policy) && + task_group(p)->rt_bandwidth.rt_runtime == 0) return -EPERM; #endif @@ -8707,73 +8708,77 @@ static DEFINE_MUTEX(rt_constraints_mutex); static unsigned long to_ratio(u64 period, u64 runtime) { if (runtime == RUNTIME_INF) - return 1ULL << 16; + return 1ULL << 20; - return div64_u64(runtime << 16, period); + return div64_u64(runtime << 20, period); } -#ifdef CONFIG_CGROUP_SCHED -static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) +/* Must be called with tasklist_lock held */ +static inline int tg_has_rt_tasks(struct task_group *tg) { - struct task_group *tgi, *parent = tg->parent; - unsigned long total = 0; + struct task_struct *g, *p; - if (!parent) { - if (global_rt_period() < period) - return 0; + do_each_thread(g, p) { + if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg) + return 1; + } while_each_thread(g, p); - return to_ratio(period, runtime) < - to_ratio(global_rt_period(), global_rt_runtime()); - } + return 0; +} - if (ktime_to_ns(parent->rt_bandwidth.rt_period) < period) - return 0; +struct rt_schedulable_data { + struct task_group *tg; + u64 rt_period; + u64 rt_runtime; +}; - rcu_read_lock(); - list_for_each_entry_rcu(tgi, &parent->children, siblings) { - if (tgi == tg) - continue; +static int tg_schedulable(struct task_group *tg, void *data) +{ + struct rt_schedulable_data *d = data; + struct task_group *child; + unsigned long total, sum = 0; + u64 period, runtime; + + period = ktime_to_ns(tg->rt_bandwidth.rt_period); + runtime = tg->rt_bandwidth.rt_runtime; - total += to_ratio(ktime_to_ns(tgi->rt_bandwidth.rt_period), - tgi->rt_bandwidth.rt_runtime); + if (tg == d->tg) { + period = d->rt_period; + runtime = d->rt_runtime; } - rcu_read_unlock(); - return total + to_ratio(period, runtime) <= - to_ratio(ktime_to_ns(parent->rt_bandwidth.rt_period), - parent->rt_bandwidth.rt_runtime); -} -#elif defined 
CONFIG_USER_SCHED -static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) -{ - struct task_group *tgi; - unsigned long total = 0; - unsigned long global_ratio = - to_ratio(global_rt_period(), global_rt_runtime()); + if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg)) + return -EBUSY; - rcu_read_lock(); - list_for_each_entry_rcu(tgi, &task_groups, list) { - if (tgi == tg) - continue; + total = to_ratio(period, runtime); - total += to_ratio(ktime_to_ns(tgi->rt_bandwidth.rt_period), - tgi->rt_bandwidth.rt_runtime); + list_for_each_entry_rcu(child, &tg->children, siblings) { + period = ktime_to_ns(child->rt_bandwidth.rt_period); + runtime = child->rt_bandwidth.rt_runtime; + + if (child == d->tg) { + period = d->rt_period; + runtime = d->rt_runtime; + } + + sum += to_ratio(period, runtime); } - rcu_read_unlock(); - return total + to_ratio(period, runtime) < global_ratio; + if (sum > total) + return -EINVAL; + + return 0; } -#endif -/* Must be called with tasklist_lock held */ -static inline int tg_has_rt_tasks(struct task_group *tg) +static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) { - struct task_struct *g, *p; - do_each_thread(g, p) { - if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg) - return 1; - } while_each_thread(g, p); - return 0; + struct rt_schedulable_data data = { + .tg = tg, + .rt_period = period, + .rt_runtime = runtime, + }; + + return walk_tg_tree(tg_schedulable, tg_nop, &data); } static int tg_set_bandwidth(struct task_group *tg, @@ -8783,14 +8788,9 @@ static int tg_set_bandwidth(struct task_group *tg, mutex_lock(&rt_constraints_mutex); read_lock(&tasklist_lock); - if (rt_runtime == 0 && tg_has_rt_tasks(tg)) { - err = -EBUSY; + err = __rt_schedulable(tg, rt_period, rt_runtime); + if (err) goto unlock; - } - if (!__rt_schedulable(tg, rt_period, rt_runtime)) { - err = -EINVAL; - goto unlock; - } spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock); tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period); 
@@ -8867,8 +8867,9 @@ static int sched_rt_global_constraints(void) rt_runtime = tg->rt_bandwidth.rt_runtime; mutex_lock(&rt_constraints_mutex); - if (!__rt_schedulable(tg, rt_period, rt_runtime)) - ret = -EINVAL; + read_lock(&tasklist_lock); + ret = __rt_schedulable(tg, rt_period, rt_runtime); + read_unlock(&tasklist_lock); mutex_unlock(&rt_constraints_mutex); return ret; -- cgit v1.2.3 From 7940ca3605b77f20cc6e9852e4ca6f2d725b5653 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 19 Aug 2008 13:40:47 +0200 Subject: sched: extract walk_tg_tree(), fix fix: kernel/sched.c: In function '__rt_schedulable': kernel/sched.c:8771: error: implicit declaration of function 'walk_tg_tree' kernel/sched.c:8771: error: 'tg_nop' undeclared (first use in this function) kernel/sched.c:8771: error: (Each undeclared identifier is reported only once kernel/sched.c:8771: error: for each function it appears in.) Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index e41bdae2778..703f56d5db5 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1387,7 +1387,7 @@ static inline void dec_cpu_load(struct rq *rq, unsigned long load) update_load_sub(&rq->load, load); } -#if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)) || defined(SCHED_RT_GROUP_SCHED) +#if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)) || defined(CONFIG_RT_GROUP_SCHED) typedef int (*tg_visitor)(struct task_group *, void *); /* -- cgit v1.2.3 From c8bfff6dd4d41834f4952cbc49e28e31906a6188 Mon Sep 17 00:00:00 2001 From: Krzysztof Helt Date: Fri, 5 Sep 2008 23:46:19 +0200 Subject: sched: compilation fix with gcc 3.4.6 I found that 2.6.27-rc5-mm1 does not compile with gcc 3.4.6. 
The error is: CC kernel/sched.o kernel/sched.c: In function `start_rt_bandwidth': kernel/sched.c:208: sorry, unimplemented: inlining failed in call to 'rt_bandwidth_enabled': function body not available kernel/sched.c:214: sorry, unimplemented: called from here make[1]: *** [kernel/sched.o] Error 1 make: *** [kernel] Error 2 It seems that gcc 3.4.6 requires the full inline definition before first use. The patch below fixes the compilation problem. Signed-off-by: Krzysztof Helt (if needed) Signed-off-by: Ingo Molnar --- kernel/sched.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 703f56d5db5..4de2bfb28c5 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -204,7 +204,10 @@ void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime) rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; } -static inline int rt_bandwidth_enabled(void); +static inline int rt_bandwidth_enabled(void) +{ + return sysctl_sched_rt_runtime >= 0; +} static void start_rt_bandwidth(struct rt_bandwidth *rt_b) { @@ -841,11 +844,6 @@ static inline u64 global_rt_runtime(void) return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC; } -static inline int rt_bandwidth_enabled(void) -{ - return sysctl_sched_rt_runtime >= 0; -} - #ifndef prepare_arch_switch # define prepare_arch_switch(next) do { } while (0) #endif -- cgit v1.2.3 From ec5d498991e87c74730509508b25c3959192b7e7 Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Wed, 10 Sep 2008 17:00:19 -0700 Subject: sched: fix deadlock in setting scheduler parameter to zero Andrei Gusev wrote: > I played with scheduler settings. After doing something like: > echo -n 1000000 >sched_rt_period_us > > command is locked.
I found in kernel.log: > > Sep 11 00:39:34 zaratustra > Sep 11 00:39:34 zaratustra Pid: 4495, comm: bash Tainted: G W > (2.6.26.3 #12) > Sep 11 00:39:34 zaratustra EIP: 0060:[] EFLAGS: 00210246 CPU: 0 > Sep 11 00:39:34 zaratustra EIP is at div64_u64+0x57/0x80 > Sep 11 00:39:34 zaratustra EAX: 0000389f EBX: 00000000 ECX: 00000000 > EDX: 00000000 > Sep 11 00:39:34 zaratustra ESI: d9800000 EDI: d9800000 EBP: 0000389f > ESP: ea7a6edc > Sep 11 00:39:34 zaratustra DS: 007b ES: 007b FS: 0000 GS: 0033 SS: 0068 > Sep 11 00:39:34 zaratustra Process bash (pid: 4495, ti=ea7a6000 > task=ea744000 task.ti=ea7a6000) > Sep 11 00:39:34 zaratustra Stack: 00000000 000003e8 d9800000 0000389f > c0119042 00000000 00000000 00000001 > Sep 11 00:39:34 zaratustra 00000000 00000000 ea7a6f54 00010000 00000000 > c04d2e80 00000001 000e7ef0 > Sep 11 00:39:34 zaratustra c01191a3 00000000 00000000 ea7a6fa0 00000001 > ffffffff c04d2e80 ea5b2480 > Sep 11 00:39:34 zaratustra Call Trace: > Sep 11 00:39:34 zaratustra [] __rt_schedulable+0x52/0x130 > Sep 11 00:39:34 zaratustra [] sched_rt_handler+0x83/0x120 > Sep 11 00:39:34 zaratustra [] proc_sys_call_handler+0xb6/0xd0 > Sep 11 00:39:34 zaratustra [] proc_sys_write+0x0/0x20 > Sep 11 00:39:34 zaratustra [] proc_sys_write+0x19/0x20 > Sep 11 00:39:34 zaratustra [] vfs_write+0xa8/0x140 > Sep 11 00:39:34 zaratustra [] sys_write+0x41/0x80 > Sep 11 00:39:34 zaratustra [] sysenter_past_esp+0x6a/0x91 > Sep 11 00:39:34 zaratustra ======================= > Sep 11 00:39:34 zaratustra Code: c8 41 0f ad f3 d3 ee f6 c1 20 0f 45 de > 31 f6 0f ad ef d3 ed f6 c1 20 0f 45 fd 0f 45 ee 31 c9 39 eb 89 fe 89 ea > 77 08 89 e8 31 d2 f3 89 c1 89 f0 8b 7c 24 08 f7 f3 8b 74 24 04 89 > ca 8b 1c 24 > Sep 11 00:39:34 zaratustra EIP: [] div64_u64+0x57/0x80 SS:ESP > 0068:ea7a6edc > Sep 11 00:39:34 zaratustra ---[ end trace 4eaa2a86a8e2da22 ]--- Fix the boundary condition: sysctl_sched_rt_period=0 causes a division by zero in to_ratio().
Signed-off-by: Hiroshi Shimamoto Signed-off-by: Ingo Molnar --- kernel/sched.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index cc1f81b50b8..98890807375 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -8909,6 +8909,9 @@ static int sched_rt_global_constraints(void) u64 rt_runtime, rt_period; int ret = 0; + if (sysctl_sched_rt_period <= 0) + return -EINVAL; + rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period); rt_runtime = tg->rt_bandwidth.rt_runtime; @@ -8925,6 +8928,9 @@ static int sched_rt_global_constraints(void) unsigned long flags; int i; + if (sysctl_sched_rt_period <= 0) + return -EINVAL; + spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); for_each_possible_cpu(i) { struct rt_rq *rt_rq = &cpu_rq(i)->rt; -- cgit v1.2.3 From fa748203175de7c08f2df80e5a0eeca40329b5e2 Mon Sep 17 00:00:00 2001 From: Rakib Mullick Date: Mon, 22 Sep 2008 14:55:45 -0700 Subject: sched: fix init_hrtick() section mismatch warning LD kernel/built-in.o WARNING: kernel/built-in.o(.text+0x326): Section mismatch in reference from the function init_hrtick() to the variable .cpuinit.data:hotplug_hrtick_nb.8 The function init_hrtick() references the variable __cpuinitdata hotplug_hrtick_nb.8. This is often because init_hrtick lacks a __cpuinitdata annotation or the annotation of hotplug_hrtick_nb.8 is wrong. Signed-off-by: Md.Rakib H. Mullick Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 98890807375..13dd2db9fb2 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1087,7 +1087,7 @@ hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu) return NOTIFY_DONE; } -static void init_hrtick(void) +static __init void init_hrtick(void) { hotcpu_notifier(hotplug_hrtick, 0); } -- cgit v1.2.3