From 67bb6c036d1fc3d332c8527a36a546e3e72e822c Mon Sep 17 00:00:00 2001 From: Gautham R Shenoy Date: Wed, 25 Mar 2009 14:43:35 +0530 Subject: sched: Simple helper functions for find_busiest_group() Impact: cleanup Currently the load idx calculation code is in find_busiest_group(). Move that to a static inline helper function. Similary, to find the first cpu of a sched_group we use cpumask_first(sched_group_cpus(group)) Use a helper to that. It improves readability in some cases. Signed-off-by: Gautham R Shenoy Acked-by: Peter Zijlstra Cc: Suresh Siddha Cc: "Balbir Singh" Cc: Nick Piggin Cc: "Dhaval Giani" Cc: Bharata B Rao Cc: "Vaidyanathan Srinivasan" LKML-Reference: <20090325091335.13992.55424.stgit@sofia.in.ibm.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 55 +++++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 43 insertions(+), 12 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index 7b389c74f8f..6aec1e7a72a 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3189,6 +3189,43 @@ static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, return 0; } +/********** Helpers for find_busiest_group ************************/ + +/** + * group_first_cpu - Returns the first cpu in the cpumask of a sched_group. + * @group: The group whose first cpu is to be returned. + */ +static inline unsigned int group_first_cpu(struct sched_group *group) +{ + return cpumask_first(sched_group_cpus(group)); +} + +/** + * get_sd_load_idx - Obtain the load index for a given sched domain. + * @sd: The sched_domain whose load_idx is to be obtained. + * @idle: The Idle status of the CPU for whose sd load_icx is obtained. + */ +static inline int get_sd_load_idx(struct sched_domain *sd, + enum cpu_idle_type idle) +{ + int load_idx; + + switch (idle) { + case CPU_NOT_IDLE: + load_idx = sd->busy_idx; + break; + + case CPU_NEWLY_IDLE: + load_idx = sd->newidle_idx; + break; + default: + load_idx = sd->idle_idx; + break; + } + + return load_idx; +} +/******* find_busiest_group() helpers end here *********************/ /* * find_busiest_group finds and returns the busiest CPU group within the @@ -3217,12 +3254,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, busiest_load_per_task = busiest_nr_running = 0; this_load_per_task = this_nr_running = 0; - if (idle == CPU_NOT_IDLE) - load_idx = sd->busy_idx; - else if (idle == CPU_NEWLY_IDLE) - load_idx = sd->newidle_idx; - else - load_idx = sd->idle_idx; + load_idx = get_sd_load_idx(sd, idle); do { unsigned long load, group_capacity, max_cpu_load, min_cpu_load; @@ -3238,7 +3270,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, sched_group_cpus(group)); if (local_group) - balance_cpu = cpumask_first(sched_group_cpus(group)); + balance_cpu = group_first_cpu(group); /* Tally up the load of all CPUs in the group */ sum_weighted_load = sum_nr_running = avg_load = 0; @@ -3359,8 +3391,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, */ if ((sum_nr_running < min_nr_running) || (sum_nr_running == min_nr_running && - cpumask_first(sched_group_cpus(group)) > - cpumask_first(sched_group_cpus(group_min)))) { + group_first_cpu(group) > group_first_cpu(group_min))) { group_min = group; min_nr_running = sum_nr_running; min_load_per_task = sum_weighted_load / @@ -3375,8 +3406,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, if (sum_nr_running <= group_capacity - 1) { if (sum_nr_running > leader_nr_running || (sum_nr_running == leader_nr_running && - cpumask_first(sched_group_cpus(group)) < - cpumask_first(sched_group_cpus(group_leader)))) { + group_first_cpu(group) < + group_first_cpu(group_leader))) { group_leader = group; leader_nr_running = sum_nr_running; } @@ -3504,7 +3535,7 @@ out_balanced: *imbalance = min_load_per_task; if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) { cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu = - cpumask_first(sched_group_cpus(group_leader)); + group_first_cpu(group_leader); } return group_min; } -- cgit v1.2.3 From 6dfdb0629019f307ab18864b1fd3e5dbb02f383c Mon Sep 17 00:00:00 2001 From: Gautham R Shenoy Date: Wed, 25 Mar 2009 14:43:40 +0530 Subject: sched: Fix indentations in find_busiest_group() using gotos Impact: cleanup Some indentations in find_busiest_group() can minimized by using early exits with the help of gotos. This improves readability in a couple of cases. Signed-off-by: Gautham R Shenoy Acked-by: Peter Zijlstra Cc: Suresh Siddha Cc: "Balbir Singh" Cc: Nick Piggin Cc: "Dhaval Giani" Cc: Bharata B Rao Cc: "Vaidyanathan Srinivasan" LKML-Reference: <20090325091340.13992.45062.stgit@sofia.in.ibm.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 32 +++++++++++++++++--------------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index 6aec1e7a72a..f87adbe999e 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3403,14 +3403,14 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, * capacity but still has some space to pick up some load * from other group and save more power */ - if (sum_nr_running <= group_capacity - 1) { - if (sum_nr_running > leader_nr_running || - (sum_nr_running == leader_nr_running && - group_first_cpu(group) < - group_first_cpu(group_leader))) { - group_leader = group; - leader_nr_running = sum_nr_running; - } + if (sum_nr_running > group_capacity - 1) + goto group_next; + + if (sum_nr_running > leader_nr_running || + (sum_nr_running == leader_nr_running && + group_first_cpu(group) < group_first_cpu(group_leader))) { + group_leader = group; + leader_nr_running = sum_nr_running; } group_next: #endif @@ -3531,14 +3531,16 @@ out_balanced: if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE)) goto ret; - if (this == group_leader && group_leader != group_min) { - *imbalance = min_load_per_task; - if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) { - cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu = - group_first_cpu(group_leader); - } - return group_min; + if (this != group_leader || group_leader == group_min) + goto ret; + + *imbalance = min_load_per_task; + if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) { + cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu = + group_first_cpu(group_leader); } + return group_min; + #endif ret: *imbalance = 0; -- cgit v1.2.3 From 381be78fdc829a22f6327a0ed09f54b6270a976d Mon Sep 17 00:00:00 2001 From: Gautham R Shenoy Date: Wed, 25 Mar 2009 14:43:46 +0530 Subject: sched: Define structure to store the sched_group statistics for fbg() Impact: cleanup Currently a whole bunch of variables are used to store the various statistics pertaining to the groups we iterate over in find_busiest_group(). Group them together in a single data structure and add appropriate comments. This will be useful later on when we create helper functions to calculate the sched_group statistics. Credit: Vaidyanathan Srinivasan Signed-off-by: Gautham R Shenoy Acked-by: Peter Zijlstra Cc: Suresh Siddha Cc: "Balbir Singh" Cc: Nick Piggin Cc: "Dhaval Giani" Cc: Bharata B Rao LKML-Reference: <20090325091345.13992.20099.stgit@sofia.in.ibm.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 79 ++++++++++++++++++++++++++++++++++------------------------ 1 file changed, 46 insertions(+), 33 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index f87adbe999e..109db122de5 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3191,6 +3191,18 @@ static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, } /********** Helpers for find_busiest_group ************************/ +/** + * sg_lb_stats - stats of a sched_group required for load_balancing + */ +struct sg_lb_stats { + unsigned long avg_load; /*Avg load across the CPUs of the group */ + unsigned long group_load; /* Total load over the CPUs of the group */ + unsigned long sum_nr_running; /* Nr tasks running in the group */ + unsigned long sum_weighted_load; /* Weighted load of group's tasks */ + unsigned long group_capacity; + int group_imb; /* Is there an imbalance in the group ? */ +}; + /** * group_first_cpu - Returns the first cpu in the cpumask of a sched_group. * @group: The group whose first cpu is to be returned. @@ -3257,23 +3269,22 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, load_idx = get_sd_load_idx(sd, idle); do { - unsigned long load, group_capacity, max_cpu_load, min_cpu_load; + struct sg_lb_stats sgs; + unsigned long load, max_cpu_load, min_cpu_load; int local_group; int i; - int __group_imb = 0; unsigned int balance_cpu = -1, first_idle_cpu = 0; - unsigned long sum_nr_running, sum_weighted_load; unsigned long sum_avg_load_per_task; unsigned long avg_load_per_task; local_group = cpumask_test_cpu(this_cpu, sched_group_cpus(group)); + memset(&sgs, 0, sizeof(sgs)); if (local_group) balance_cpu = group_first_cpu(group); /* Tally up the load of all CPUs in the group */ - sum_weighted_load = sum_nr_running = avg_load = 0; sum_avg_load_per_task = avg_load_per_task = 0; max_cpu_load = 0; @@ -3301,9 +3312,9 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, min_cpu_load = load; } - avg_load += load; - sum_nr_running += rq->nr_running; - sum_weighted_load += weighted_cpuload(i); + sgs.group_load += load; + sgs.sum_nr_running += rq->nr_running; + sgs.sum_weighted_load += weighted_cpuload(i); sum_avg_load_per_task += cpu_avg_load_per_task(i); } @@ -3320,12 +3331,12 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, goto ret; } - total_load += avg_load; + total_load += sgs.group_load; total_pwr += group->__cpu_power; /* Adjust by relative CPU power of the group */ - avg_load = sg_div_cpu_power(group, - avg_load * SCHED_LOAD_SCALE); + sgs.avg_load = sg_div_cpu_power(group, + sgs.group_load * SCHED_LOAD_SCALE); /* @@ -3341,22 +3352,23 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, sum_avg_load_per_task * SCHED_LOAD_SCALE); if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task) - __group_imb = 1; + sgs.group_imb = 1; - group_capacity = group->__cpu_power / SCHED_LOAD_SCALE; + sgs.group_capacity = group->__cpu_power / SCHED_LOAD_SCALE; if (local_group) { - this_load = avg_load; + this_load = sgs.avg_load; this = group; - this_nr_running = sum_nr_running; - this_load_per_task = sum_weighted_load; - } else if (avg_load > max_load && - (sum_nr_running > group_capacity || __group_imb)) { - max_load = avg_load; + this_nr_running = sgs.sum_nr_running; + this_load_per_task = sgs.sum_weighted_load; + } else if (sgs.avg_load > max_load && + (sgs.sum_nr_running > sgs.group_capacity || + sgs.group_imb)) { + max_load = sgs.avg_load; busiest = group; - busiest_nr_running = sum_nr_running; - busiest_load_per_task = sum_weighted_load; - group_imb = __group_imb; + busiest_nr_running = sgs.sum_nr_running; + busiest_load_per_task = sgs.sum_weighted_load; + group_imb = sgs.group_imb; } #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) @@ -3372,7 +3384,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, * If the local group is idle or completely loaded * no need to do power savings balance at this domain */ - if (local_group && (this_nr_running >= group_capacity || + if (local_group && (this_nr_running >= sgs.group_capacity || !this_nr_running)) power_savings_balance = 0; @@ -3380,8 +3392,9 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, * If a group is already running at full capacity or idle, * don't include that group in power savings calculations */ - if (!power_savings_balance || sum_nr_running >= group_capacity - || !sum_nr_running) + if (!power_savings_balance || + sgs.sum_nr_running >= sgs.group_capacity || + !sgs.sum_nr_running) goto group_next; /* @@ -3389,13 +3402,13 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, * This is the group from where we need to pick up the load * for saving power */ - if ((sum_nr_running < min_nr_running) || - (sum_nr_running == min_nr_running && + if ((sgs.sum_nr_running < min_nr_running) || + (sgs.sum_nr_running == min_nr_running && group_first_cpu(group) > group_first_cpu(group_min))) { group_min = group; - min_nr_running = sum_nr_running; - min_load_per_task = sum_weighted_load / - sum_nr_running; + min_nr_running = sgs.sum_nr_running; + min_load_per_task = sgs.sum_weighted_load / + sgs.sum_nr_running; } /* @@ -3403,14 +3416,14 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, * capacity but still has some space to pick up some load * from other group and save more power */ - if (sum_nr_running > group_capacity - 1) + if (sgs.sum_nr_running > sgs.group_capacity - 1) goto group_next; - if (sum_nr_running > leader_nr_running || - (sum_nr_running == leader_nr_running && + if (sgs.sum_nr_running > leader_nr_running || + (sgs.sum_nr_running == leader_nr_running && group_first_cpu(group) < group_first_cpu(group_leader))) { group_leader = group; - leader_nr_running = sum_nr_running; + leader_nr_running = sgs.sum_nr_running; } group_next: #endif -- cgit v1.2.3 From 1f8c553d0f11d85f7993fe21015695d266771c00 Mon Sep 17 00:00:00 2001 From: Gautham R Shenoy Date: Wed, 25 Mar 2009 14:43:51 +0530 Subject: sched: Create a helper function to calculate sched_group stats for fbg() Impact: cleanup Create a helper function named update_sg_lb_stats() which can be invoked to calculate the individual group's statistics in find_busiest_group(). This reduces the lenght of find_busiest_group() considerably. Credit: Vaidyanathan Srinivasan Signed-off-by: Gautham R Shenoy Aked-by: Peter Zijlstra Cc: Suresh Siddha Cc: "Balbir Singh" Cc: Nick Piggin Cc: "Dhaval Giani" Cc: Bharata B Rao LKML-Reference: <20090325091351.13992.43461.stgit@sofia.in.ibm.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 175 ++++++++++++++++++++++++++++++++------------------------- 1 file changed, 100 insertions(+), 75 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index 109db122de5..1893d5562f5 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3237,6 +3237,103 @@ static inline int get_sd_load_idx(struct sched_domain *sd, return load_idx; } + + +/** + * update_sg_lb_stats - Update sched_group's statistics for load balancing. + * @group: sched_group whose statistics are to be updated. + * @this_cpu: Cpu for which load balance is currently performed. + * @idle: Idle status of this_cpu + * @load_idx: Load index of sched_domain of this_cpu for load calc. + * @sd_idle: Idle status of the sched_domain containing group. + * @local_group: Does group contain this_cpu. + * @cpus: Set of cpus considered for load balancing. + * @balance: Should we balance. + * @sgs: variable to hold the statistics for this group. + */ +static inline void update_sg_lb_stats(struct sched_group *group, int this_cpu, + enum cpu_idle_type idle, int load_idx, int *sd_idle, + int local_group, const struct cpumask *cpus, + int *balance, struct sg_lb_stats *sgs) +{ + unsigned long load, max_cpu_load, min_cpu_load; + int i; + unsigned int balance_cpu = -1, first_idle_cpu = 0; + unsigned long sum_avg_load_per_task; + unsigned long avg_load_per_task; + + if (local_group) + balance_cpu = group_first_cpu(group); + + /* Tally up the load of all CPUs in the group */ + sum_avg_load_per_task = avg_load_per_task = 0; + max_cpu_load = 0; + min_cpu_load = ~0UL; + + for_each_cpu_and(i, sched_group_cpus(group), cpus) { + struct rq *rq = cpu_rq(i); + + if (*sd_idle && rq->nr_running) + *sd_idle = 0; + + /* Bias balancing toward cpus of our domain */ + if (local_group) { + if (idle_cpu(i) && !first_idle_cpu) { + first_idle_cpu = 1; + balance_cpu = i; + } + + load = target_load(i, load_idx); + } else { + load = source_load(i, load_idx); + if (load > max_cpu_load) + max_cpu_load = load; + if (min_cpu_load > load) + min_cpu_load = load; + } + + sgs->group_load += load; + sgs->sum_nr_running += rq->nr_running; + sgs->sum_weighted_load += weighted_cpuload(i); + + sum_avg_load_per_task += cpu_avg_load_per_task(i); + } + + /* + * First idle cpu or the first cpu(busiest) in this sched group + * is eligible for doing load balancing at this and above + * domains. In the newly idle case, we will allow all the cpu's + * to do the newly idle load balance. + */ + if (idle != CPU_NEWLY_IDLE && local_group && + balance_cpu != this_cpu && balance) { + *balance = 0; + return; + } + + /* Adjust by relative CPU power of the group */ + sgs->avg_load = sg_div_cpu_power(group, + sgs->group_load * SCHED_LOAD_SCALE); + + + /* + * Consider the group unbalanced when the imbalance is larger + * than the average weight of two tasks. + * + * APZ: with cgroup the avg task weight can vary wildly and + * might not be a suitable number - should we keep a + * normalized nr_running number somewhere that negates + * the hierarchy? + */ + avg_load_per_task = sg_div_cpu_power(group, + sum_avg_load_per_task * SCHED_LOAD_SCALE); + + if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task) + sgs->group_imb = 1; + + sgs->group_capacity = group->__cpu_power / SCHED_LOAD_SCALE; + +} /******* find_busiest_group() helpers end here *********************/ /* @@ -3270,92 +3367,20 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, do { struct sg_lb_stats sgs; - unsigned long load, max_cpu_load, min_cpu_load; int local_group; - int i; - unsigned int balance_cpu = -1, first_idle_cpu = 0; - unsigned long sum_avg_load_per_task; - unsigned long avg_load_per_task; local_group = cpumask_test_cpu(this_cpu, sched_group_cpus(group)); memset(&sgs, 0, sizeof(sgs)); + update_sg_lb_stats(group, this_cpu, idle, load_idx, sd_idle, + local_group, cpus, balance, &sgs); - if (local_group) - balance_cpu = group_first_cpu(group); - - /* Tally up the load of all CPUs in the group */ - sum_avg_load_per_task = avg_load_per_task = 0; - - max_cpu_load = 0; - min_cpu_load = ~0UL; - - for_each_cpu_and(i, sched_group_cpus(group), cpus) { - struct rq *rq = cpu_rq(i); - - if (*sd_idle && rq->nr_running) - *sd_idle = 0; - - /* Bias balancing toward cpus of our domain */ - if (local_group) { - if (idle_cpu(i) && !first_idle_cpu) { - first_idle_cpu = 1; - balance_cpu = i; - } - - load = target_load(i, load_idx); - } else { - load = source_load(i, load_idx); - if (load > max_cpu_load) - max_cpu_load = load; - if (min_cpu_load > load) - min_cpu_load = load; - } - - sgs.group_load += load; - sgs.sum_nr_running += rq->nr_running; - sgs.sum_weighted_load += weighted_cpuload(i); - - sum_avg_load_per_task += cpu_avg_load_per_task(i); - } - - /* - * First idle cpu or the first cpu(busiest) in this sched group - * is eligible for doing load balancing at this and above - * domains. In the newly idle case, we will allow all the cpu's - * to do the newly idle load balance. - */ - if (idle != CPU_NEWLY_IDLE && local_group && - balance_cpu != this_cpu && balance) { - *balance = 0; + if (balance && !(*balance)) goto ret; - } total_load += sgs.group_load; total_pwr += group->__cpu_power; - /* Adjust by relative CPU power of the group */ - sgs.avg_load = sg_div_cpu_power(group, - sgs.group_load * SCHED_LOAD_SCALE); - - - /* - * Consider the group unbalanced when the imbalance is larger - * than the average weight of two tasks. - * - * APZ: with cgroup the avg task weight can vary wildly and - * might not be a suitable number - should we keep a - * normalized nr_running number somewhere that negates - * the hierarchy? - */ - avg_load_per_task = sg_div_cpu_power(group, - sum_avg_load_per_task * SCHED_LOAD_SCALE); - - if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task) - sgs.group_imb = 1; - - sgs.group_capacity = group->__cpu_power / SCHED_LOAD_SCALE; - if (local_group) { this_load = sgs.avg_load; this = group; -- cgit v1.2.3 From 222d656dea57e4e084fbd1e9383e6fed2ca9fa61 Mon Sep 17 00:00:00 2001 From: Gautham R Shenoy Date: Wed, 25 Mar 2009 14:43:56 +0530 Subject: sched: Define structure to store the sched_domain statistics for fbg() Impact: cleanup Currently we use a lot of local variables in find_busiest_group() to capture the various statistics related to the sched_domain. Group them together into a single data structure. This will help us to offload the job of updating the sched_domain statistics to a helper function. Credit: Vaidyanathan Srinivasan Signed-off-by: Gautham R Shenoy Acked-by: Peter Zijlstra Cc: Suresh Siddha Cc: "Balbir Singh" Cc: Nick Piggin Cc: "Dhaval Giani" Cc: Bharata B Rao LKML-Reference: <20090325091356.13992.25970.stgit@sofia.in.ibm.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 207 +++++++++++++++++++++++++++++++++------------------------ 1 file changed, 121 insertions(+), 86 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index 1893d5562f5..8198dbe8e4a 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3190,6 +3190,37 @@ static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, return 0; } /********** Helpers for find_busiest_group ************************/ +/** + * sd_lb_stats - Structure to store the statistics of a sched_domain + * during load balancing. + */ +struct sd_lb_stats { + struct sched_group *busiest; /* Busiest group in this sd */ + struct sched_group *this; /* Local group in this sd */ + unsigned long total_load; /* Total load of all groups in sd */ + unsigned long total_pwr; /* Total power of all groups in sd */ + unsigned long avg_load; /* Average load across all groups in sd */ + + /** Statistics of this group */ + unsigned long this_load; + unsigned long this_load_per_task; + unsigned long this_nr_running; + + /* Statistics of the busiest group */ + unsigned long max_load; + unsigned long busiest_load_per_task; + unsigned long busiest_nr_running; + + int group_imb; /* Is there imbalance in this sd */ +#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) + int power_savings_balance; /* Is powersave balance needed for this sd */ + struct sched_group *group_min; /* Least loaded group in sd */ + struct sched_group *group_leader; /* Group which relieves group_min */ + unsigned long min_load_per_task; /* load_per_task in group_min */ + unsigned long leader_nr_running; /* Nr running of group_leader */ + unsigned long min_nr_running; /* Nr running of group_min */ +#endif +}; /** * sg_lb_stats - stats of a sched_group required for load_balancing @@ -3346,23 +3377,16 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, unsigned long *imbalance, enum cpu_idle_type idle, int *sd_idle, const struct cpumask *cpus, int *balance) { - struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; - unsigned long max_load, avg_load, total_load, this_load, total_pwr; + struct sd_lb_stats sds; + struct sched_group *group = sd->groups; unsigned long max_pull; - unsigned long busiest_load_per_task, busiest_nr_running; - unsigned long this_load_per_task, this_nr_running; - int load_idx, group_imb = 0; + int load_idx; + + memset(&sds, 0, sizeof(sds)); #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) - int power_savings_balance = 1; - unsigned long leader_nr_running = 0, min_load_per_task = 0; - unsigned long min_nr_running = ULONG_MAX; - struct sched_group *group_min = NULL, *group_leader = NULL; + sds.power_savings_balance = 1; + sds.min_nr_running = ULONG_MAX; #endif - - max_load = this_load = total_load = total_pwr = 0; - busiest_load_per_task = busiest_nr_running = 0; - this_load_per_task = this_nr_running = 0; - load_idx = get_sd_load_idx(sd, idle); do { @@ -3378,22 +3402,22 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, if (balance && !(*balance)) goto ret; - total_load += sgs.group_load; - total_pwr += group->__cpu_power; + sds.total_load += sgs.group_load; + sds.total_pwr += group->__cpu_power; if (local_group) { - this_load = sgs.avg_load; - this = group; - this_nr_running = sgs.sum_nr_running; - this_load_per_task = sgs.sum_weighted_load; - } else if (sgs.avg_load > max_load && + sds.this_load = sgs.avg_load; + sds.this = group; + sds.this_nr_running = sgs.sum_nr_running; + sds.this_load_per_task = sgs.sum_weighted_load; + } else if (sgs.avg_load > sds.max_load && (sgs.sum_nr_running > sgs.group_capacity || sgs.group_imb)) { - max_load = sgs.avg_load; - busiest = group; - busiest_nr_running = sgs.sum_nr_running; - busiest_load_per_task = sgs.sum_weighted_load; - group_imb = sgs.group_imb; + sds.max_load = sgs.avg_load; + sds.busiest = group; + sds.busiest_nr_running = sgs.sum_nr_running; + sds.busiest_load_per_task = sgs.sum_weighted_load; + sds.group_imb = sgs.group_imb; } #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) @@ -3409,15 +3433,16 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, * If the local group is idle or completely loaded * no need to do power savings balance at this domain */ - if (local_group && (this_nr_running >= sgs.group_capacity || - !this_nr_running)) - power_savings_balance = 0; + if (local_group && + (sds.this_nr_running >= sgs.group_capacity || + !sds.this_nr_running)) + sds.power_savings_balance = 0; /* * If a group is already running at full capacity or idle, * don't include that group in power savings calculations */ - if (!power_savings_balance || + if (!sds.power_savings_balance || sgs.sum_nr_running >= sgs.group_capacity || !sgs.sum_nr_running) goto group_next; @@ -3427,12 +3452,13 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, * This is the group from where we need to pick up the load * for saving power */ - if ((sgs.sum_nr_running < min_nr_running) || - (sgs.sum_nr_running == min_nr_running && - group_first_cpu(group) > group_first_cpu(group_min))) { - group_min = group; - min_nr_running = sgs.sum_nr_running; - min_load_per_task = sgs.sum_weighted_load / + if ((sgs.sum_nr_running < sds.min_nr_running) || + (sgs.sum_nr_running == sds.min_nr_running && + group_first_cpu(group) > + group_first_cpu(sds.group_min))) { + sds.group_min = group; + sds.min_nr_running = sgs.sum_nr_running; + sds.min_load_per_task = sgs.sum_weighted_load / sgs.sum_nr_running; } @@ -3444,29 +3470,32 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, if (sgs.sum_nr_running > sgs.group_capacity - 1) goto group_next; - if (sgs.sum_nr_running > leader_nr_running || - (sgs.sum_nr_running == leader_nr_running && - group_first_cpu(group) < group_first_cpu(group_leader))) { - group_leader = group; - leader_nr_running = sgs.sum_nr_running; + if (sgs.sum_nr_running > sds.leader_nr_running || + (sgs.sum_nr_running == sds.leader_nr_running && + group_first_cpu(group) < + group_first_cpu(sds.group_leader))) { + sds.group_leader = group; + sds.leader_nr_running = sgs.sum_nr_running; } group_next: #endif group = group->next; } while (group != sd->groups); - if (!busiest || this_load >= max_load || busiest_nr_running == 0) + if (!sds.busiest || sds.this_load >= sds.max_load + || sds.busiest_nr_running == 0) goto out_balanced; - avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr; + sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr; - if (this_load >= avg_load || - 100*max_load <= sd->imbalance_pct*this_load) + if (sds.this_load >= sds.avg_load || + 100*sds.max_load <= sd->imbalance_pct * sds.this_load) goto out_balanced; - busiest_load_per_task /= busiest_nr_running; - if (group_imb) - busiest_load_per_task = min(busiest_load_per_task, avg_load); + sds.busiest_load_per_task /= sds.busiest_nr_running; + if (sds.group_imb) + sds.busiest_load_per_task = + min(sds.busiest_load_per_task, sds.avg_load); /* * We're trying to get all the cpus to the average_load, so we don't @@ -3479,7 +3508,7 @@ group_next: * by pulling tasks to us. Be careful of negative numbers as they'll * appear as very large values with unsigned longs. */ - if (max_load <= busiest_load_per_task) + if (sds.max_load <= sds.busiest_load_per_task) goto out_balanced; /* @@ -3487,17 +3516,18 @@ group_next: * max load less than avg load(as we skip the groups at or below * its cpu_power, while calculating max_load..) */ - if (max_load < avg_load) { + if (sds.max_load < sds.avg_load) { *imbalance = 0; goto small_imbalance; } /* Don't want to pull so many tasks that a group would go idle */ - max_pull = min(max_load - avg_load, max_load - busiest_load_per_task); + max_pull = min(sds.max_load - sds.avg_load, + sds.max_load - sds.busiest_load_per_task); /* How much load to actually move to equalise the imbalance */ - *imbalance = min(max_pull * busiest->__cpu_power, - (avg_load - this_load) * this->__cpu_power) + *imbalance = min(max_pull * sds.busiest->__cpu_power, + (sds.avg_load - sds.this_load) * sds.this->__cpu_power) / SCHED_LOAD_SCALE; /* @@ -3506,24 +3536,27 @@ group_next: * a think about bumping its value to force at least one task to be * moved */ - if (*imbalance < busiest_load_per_task) { + if (*imbalance < sds.busiest_load_per_task) { unsigned long tmp, pwr_now, pwr_move; unsigned int imbn; small_imbalance: pwr_move = pwr_now = 0; imbn = 2; - if (this_nr_running) { - this_load_per_task /= this_nr_running; - if (busiest_load_per_task > this_load_per_task) + if (sds.this_nr_running) { + sds.this_load_per_task /= sds.this_nr_running; + if (sds.busiest_load_per_task > + sds.this_load_per_task) imbn = 1; } else - this_load_per_task = cpu_avg_load_per_task(this_cpu); - - if (max_load - this_load + busiest_load_per_task >= - busiest_load_per_task * imbn) { - *imbalance = busiest_load_per_task; - return busiest; + sds.this_load_per_task = + cpu_avg_load_per_task(this_cpu); + + if (sds.max_load - sds.this_load + + sds.busiest_load_per_task >= + sds.busiest_load_per_task * imbn) { + *imbalance = sds.busiest_load_per_task; + return sds.busiest; } /* @@ -3532,52 +3565,54 @@ small_imbalance: * moving them. */ - pwr_now += busiest->__cpu_power * - min(busiest_load_per_task, max_load); - pwr_now += this->__cpu_power * - min(this_load_per_task, this_load); + pwr_now += sds.busiest->__cpu_power * + min(sds.busiest_load_per_task, sds.max_load); + pwr_now += sds.this->__cpu_power * + min(sds.this_load_per_task, sds.this_load); pwr_now /= SCHED_LOAD_SCALE; /* Amount of load we'd subtract */ - tmp = sg_div_cpu_power(busiest, - busiest_load_per_task * SCHED_LOAD_SCALE); - if (max_load > tmp) - pwr_move += busiest->__cpu_power * - min(busiest_load_per_task, max_load - tmp); + tmp = sg_div_cpu_power(sds.busiest, + sds.busiest_load_per_task * SCHED_LOAD_SCALE); + if (sds.max_load > tmp) + pwr_move += sds.busiest->__cpu_power * + min(sds.busiest_load_per_task, + sds.max_load - tmp); /* Amount of load we'd add */ - if (max_load * busiest->__cpu_power < - busiest_load_per_task * SCHED_LOAD_SCALE) - tmp = sg_div_cpu_power(this, - max_load * busiest->__cpu_power); + if (sds.max_load * sds.busiest->__cpu_power < + sds.busiest_load_per_task * SCHED_LOAD_SCALE) + tmp = sg_div_cpu_power(sds.this, + sds.max_load * sds.busiest->__cpu_power); else - tmp = sg_div_cpu_power(this, - busiest_load_per_task * SCHED_LOAD_SCALE); - pwr_move += this->__cpu_power * - min(this_load_per_task, this_load + tmp); + tmp = sg_div_cpu_power(sds.this, + sds.busiest_load_per_task * SCHED_LOAD_SCALE); + pwr_move += sds.this->__cpu_power * + min(sds.this_load_per_task, + sds.this_load + tmp); pwr_move /= SCHED_LOAD_SCALE; /* Move if we gain throughput */ if (pwr_move > pwr_now) - *imbalance = busiest_load_per_task; + *imbalance = sds.busiest_load_per_task; } - return busiest; + return sds.busiest; out_balanced: #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE)) goto ret; - if (this != group_leader || group_leader == group_min) + if (sds.this != sds.group_leader || sds.group_leader == sds.group_min) goto ret; - *imbalance = min_load_per_task; + *imbalance = sds.min_load_per_task; if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) { cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu = - group_first_cpu(group_leader); + group_first_cpu(sds.group_leader); } - return group_min; + return sds.group_min; #endif ret: -- cgit v1.2.3 From 37abe198b1246ddd206319c43502a687db62d347 Mon Sep 17 00:00:00 2001 From: Gautham R Shenoy Date: Wed, 25 Mar 2009 14:44:01 +0530 Subject: sched: Create a helper function to calculate sched_domain stats for fbg() Impact: cleanup Create a helper function named update_sd_lb_stats() to update the various sched_domain related statistics in find_busiest_group(). With this we would have moved all the statistics computation out of find_busiest_group(). Credit: Vaidyanathan Srinivasan Signed-off-by: Gautham R Shenoy Acked-by: Peter Zijlstra Cc: Suresh Siddha Cc: "Balbir Singh" Cc: Nick Piggin Cc: "Dhaval Giani" Cc: Bharata B Rao LKML-Reference: <20090325091401.13992.88737.stgit@sofia.in.ibm.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 117 +++++++++++++++++++++++++++++++++++---------------------- 1 file changed, 73 insertions(+), 44 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index 8198dbe8e4a..ec715f97202 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3365,32 +3365,33 @@ static inline void update_sg_lb_stats(struct sched_group *group, int this_cpu, sgs->group_capacity = group->__cpu_power / SCHED_LOAD_SCALE; } -/******* find_busiest_group() helpers end here *********************/ -/* - * find_busiest_group finds and returns the busiest CPU group within the - * domain. It calculates and returns the amount of weighted load which - * should be moved to restore balance via the imbalance parameter. +/** + * update_sd_lb_stats - Update sched_group's statistics for load balancing. + * @sd: sched_domain whose statistics are to be updated. + * @this_cpu: Cpu for which load balance is currently performed. + * @idle: Idle status of this_cpu + * @sd_idle: Idle status of the sched_domain containing group. + * @cpus: Set of cpus considered for load balancing. + * @balance: Should we balance. + * @sds: variable to hold the statistics for this sched_domain. */ -static struct sched_group * -find_busiest_group(struct sched_domain *sd, int this_cpu, - unsigned long *imbalance, enum cpu_idle_type idle, - int *sd_idle, const struct cpumask *cpus, int *balance) +static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, + enum cpu_idle_type idle, int *sd_idle, + const struct cpumask *cpus, int *balance, + struct sd_lb_stats *sds) { - struct sd_lb_stats sds; struct sched_group *group = sd->groups; - unsigned long max_pull; + struct sg_lb_stats sgs; int load_idx; - memset(&sds, 0, sizeof(sds)); #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) - sds.power_savings_balance = 1; - sds.min_nr_running = ULONG_MAX; + sds->power_savings_balance = 1; + sds->min_nr_running = ULONG_MAX; #endif load_idx = get_sd_load_idx(sd, idle); do { - struct sg_lb_stats sgs; int local_group; local_group = cpumask_test_cpu(this_cpu, @@ -3399,25 +3400,25 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, update_sg_lb_stats(group, this_cpu, idle, load_idx, sd_idle, local_group, cpus, balance, &sgs); - if (balance && !(*balance)) - goto ret; + if (local_group && balance && !(*balance)) + return; - sds.total_load += sgs.group_load; - sds.total_pwr += group->__cpu_power; + sds->total_load += sgs.group_load; + sds->total_pwr += group->__cpu_power; if (local_group) { - sds.this_load = sgs.avg_load; - sds.this = group; - sds.this_nr_running = sgs.sum_nr_running; - sds.this_load_per_task = sgs.sum_weighted_load; - } else if (sgs.avg_load > sds.max_load && + sds->this_load = sgs.avg_load; + sds->this = group; + sds->this_nr_running = sgs.sum_nr_running; + sds->this_load_per_task = sgs.sum_weighted_load; + } else if (sgs.avg_load > sds->max_load && (sgs.sum_nr_running > sgs.group_capacity || sgs.group_imb)) { - sds.max_load = sgs.avg_load; - sds.busiest = group; - sds.busiest_nr_running = sgs.sum_nr_running; - sds.busiest_load_per_task = sgs.sum_weighted_load; - sds.group_imb = sgs.group_imb; + sds->max_load = sgs.avg_load; + sds->busiest = group; + sds->busiest_nr_running = sgs.sum_nr_running; + sds->busiest_load_per_task = sgs.sum_weighted_load; + sds->group_imb = sgs.group_imb; } #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) @@ -3434,15 +3435,15 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, * no need to do power savings balance at this domain */ if (local_group && - (sds.this_nr_running >= sgs.group_capacity || - !sds.this_nr_running)) - sds.power_savings_balance = 0; + (sds->this_nr_running >= sgs.group_capacity || + !sds->this_nr_running)) + sds->power_savings_balance = 0; /* * If a group is already running at full capacity or idle, * don't include that group in power savings calculations */ - if (!sds.power_savings_balance || + if (!sds->power_savings_balance || sgs.sum_nr_running >= sgs.group_capacity || !sgs.sum_nr_running) goto group_next; @@ -3452,13 +3453,13 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, * This is the group from where we need to pick up the load * for saving power */ - if ((sgs.sum_nr_running < sds.min_nr_running) || - (sgs.sum_nr_running == sds.min_nr_running && + if ((sgs.sum_nr_running < sds->min_nr_running) || + (sgs.sum_nr_running == sds->min_nr_running && group_first_cpu(group) > - group_first_cpu(sds.group_min))) { - sds.group_min = group; - sds.min_nr_running = sgs.sum_nr_running; - sds.min_load_per_task = sgs.sum_weighted_load / + group_first_cpu(sds->group_min))) { + sds->group_min = group; + sds->min_nr_running = sgs.sum_nr_running; + sds->min_load_per_task = sgs.sum_weighted_load / sgs.sum_nr_running; } @@ -3470,18 +3471,46 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, if (sgs.sum_nr_running > sgs.group_capacity - 1) goto group_next; - if (sgs.sum_nr_running > sds.leader_nr_running || - (sgs.sum_nr_running == sds.leader_nr_running && + if (sgs.sum_nr_running > sds->leader_nr_running || + (sgs.sum_nr_running == sds->leader_nr_running && group_first_cpu(group) < - group_first_cpu(sds.group_leader))) { - sds.group_leader = group; - sds.leader_nr_running = sgs.sum_nr_running; + group_first_cpu(sds->group_leader))) { + sds->group_leader = group; + sds->leader_nr_running = sgs.sum_nr_running; } group_next: #endif group = group->next; } while (group != sd->groups); +} +/******* find_busiest_group() helpers end here *********************/ + +/* + * find_busiest_group finds and returns the busiest CPU group within the + * domain. It calculates and returns the amount of weighted load which + * should be moved to restore balance via the imbalance parameter. + */ +static struct sched_group * +find_busiest_group(struct sched_domain *sd, int this_cpu, + unsigned long *imbalance, enum cpu_idle_type idle, + int *sd_idle, const struct cpumask *cpus, int *balance) +{ + struct sd_lb_stats sds; + unsigned long max_pull; + + memset(&sds, 0, sizeof(sds)); + + /* + * Compute the various statistics relavent for load balancing at + * this level. + */ + update_sd_lb_stats(sd, this_cpu, idle, sd_idle, cpus, + balance, &sds); + + if (balance && !(*balance)) + goto ret; + if (!sds.busiest || sds.this_load >= sds.max_load || sds.busiest_nr_running == 0) goto out_balanced; -- cgit v1.2.3 From 2e6f44aeda426054fc58464df1ad571aecca0c92 Mon Sep 17 00:00:00 2001 From: Gautham R Shenoy Date: Wed, 25 Mar 2009 14:44:06 +0530 Subject: sched: Create helper to calculate small_imbalance in fbg() Impact: cleanup We have two places in find_busiest_group() where we need to calculate the minor imbalance before returning the busiest group. Encapsulate this functionality into a seperate helper function. Credit: Vaidyanathan Srinivasan Signed-off-by: Gautham R Shenoy Acked-by: Peter Zijlstra Cc: Suresh Siddha Cc: "Balbir Singh" Cc: Nick Piggin Cc: "Dhaval Giani" Cc: Bharata B Rao LKML-Reference: <20090325091406.13992.54316.stgit@sofia.in.ibm.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 131 ++++++++++++++++++++++++++++++--------------------------- 1 file changed, 70 insertions(+), 61 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index ec715f97202..540147e5e82 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3484,6 +3484,71 @@ group_next: } while (group != sd->groups); } + +/** + * fix_small_imbalance - Calculate the minor imbalance that exists + * amongst the groups of a sched_domain, during + * load balancing. + * @sds: Statistics of the sched_domain whose imbalance is to be calculated. + * @this_cpu: The cpu at whose sched_domain we're performing load-balance. + * @imbalance: Variable to store the imbalance. + */ +static inline void fix_small_imbalance(struct sd_lb_stats *sds, + int this_cpu, unsigned long *imbalance) +{ + unsigned long tmp, pwr_now = 0, pwr_move = 0; + unsigned int imbn = 2; + + if (sds->this_nr_running) { + sds->this_load_per_task /= sds->this_nr_running; + if (sds->busiest_load_per_task > + sds->this_load_per_task) + imbn = 1; + } else + sds->this_load_per_task = + cpu_avg_load_per_task(this_cpu); + + if (sds->max_load - sds->this_load + sds->busiest_load_per_task >= + sds->busiest_load_per_task * imbn) { + *imbalance = sds->busiest_load_per_task; + return; + } + + /* + * OK, we don't have enough imbalance to justify moving tasks, + * however we may be able to increase total CPU power used by + * moving them. + */ + + pwr_now += sds->busiest->__cpu_power * + min(sds->busiest_load_per_task, sds->max_load); + pwr_now += sds->this->__cpu_power * + min(sds->this_load_per_task, sds->this_load); + pwr_now /= SCHED_LOAD_SCALE; + + /* Amount of load we'd subtract */ + tmp = sg_div_cpu_power(sds->busiest, + sds->busiest_load_per_task * SCHED_LOAD_SCALE); + if (sds->max_load > tmp) + pwr_move += sds->busiest->__cpu_power * + min(sds->busiest_load_per_task, sds->max_load - tmp); + + /* Amount of load we'd add */ + if (sds->max_load * sds->busiest->__cpu_power < + sds->busiest_load_per_task * SCHED_LOAD_SCALE) + tmp = sg_div_cpu_power(sds->this, + sds->max_load * sds->busiest->__cpu_power); + else + tmp = sg_div_cpu_power(sds->this, + sds->busiest_load_per_task * SCHED_LOAD_SCALE); + pwr_move += sds->this->__cpu_power * + min(sds->this_load_per_task, sds->this_load + tmp); + pwr_move /= SCHED_LOAD_SCALE; + + /* Move if we gain throughput */ + if (pwr_move > pwr_now) + *imbalance = sds->busiest_load_per_task; +} /******* find_busiest_group() helpers end here *********************/ /* @@ -3547,7 +3612,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, */ if (sds.max_load < sds.avg_load) { *imbalance = 0; - goto small_imbalance; + fix_small_imbalance(&sds, this_cpu, imbalance); + goto ret_busiest; } /* Don't want to pull so many tasks that a group would go idle */ @@ -3565,67 +3631,10 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, * a think about bumping its value to force at least one task to be * moved */ - if (*imbalance < sds.busiest_load_per_task) { - unsigned long tmp, pwr_now, pwr_move; - unsigned int imbn; - -small_imbalance: - pwr_move = pwr_now = 0; - imbn = 2; - if (sds.this_nr_running) { - sds.this_load_per_task /= sds.this_nr_running; - if (sds.busiest_load_per_task > - sds.this_load_per_task) - imbn = 1; - } else - sds.this_load_per_task = - cpu_avg_load_per_task(this_cpu); - - if (sds.max_load - sds.this_load + - sds.busiest_load_per_task >= - sds.busiest_load_per_task * imbn) { - *imbalance = sds.busiest_load_per_task; - return sds.busiest; - } - - /* - * OK, we don't have enough imbalance to justify moving tasks, - * however we may be able to increase total CPU power used by - * moving them. - */ - - pwr_now += sds.busiest->__cpu_power * - min(sds.busiest_load_per_task, sds.max_load); - pwr_now += sds.this->__cpu_power * - min(sds.this_load_per_task, sds.this_load); - pwr_now /= SCHED_LOAD_SCALE; - - /* Amount of load we'd subtract */ - tmp = sg_div_cpu_power(sds.busiest, - sds.busiest_load_per_task * SCHED_LOAD_SCALE); - if (sds.max_load > tmp) - pwr_move += sds.busiest->__cpu_power * - min(sds.busiest_load_per_task, - sds.max_load - tmp); - - /* Amount of load we'd add */ - if (sds.max_load * sds.busiest->__cpu_power < - sds.busiest_load_per_task * SCHED_LOAD_SCALE) - tmp = sg_div_cpu_power(sds.this, - sds.max_load * sds.busiest->__cpu_power); - else - tmp = sg_div_cpu_power(sds.this, - sds.busiest_load_per_task * SCHED_LOAD_SCALE); - pwr_move += sds.this->__cpu_power * - min(sds.this_load_per_task, - sds.this_load + tmp); - pwr_move /= SCHED_LOAD_SCALE; - - /* Move if we gain throughput */ - if (pwr_move > pwr_now) - *imbalance = sds.busiest_load_per_task; - } + if (*imbalance < sds.busiest_load_per_task) + fix_small_imbalance(&sds, this_cpu, imbalance); +ret_busiest: return sds.busiest; out_balanced: -- cgit v1.2.3 From dbc523a3b86f9e1765b5e70e6886913b99cc5cec Mon Sep 17 00:00:00 2001 From: Gautham R Shenoy Date: Wed, 25 Mar 2009 14:44:12 +0530 Subject: sched: Create a helper function to calculate imbalance Move all the imbalance calculation out of find_busiest_group() through this helper function. With this change, the structure of find_busiest_group() will be as follows: - update_sched_domain_statistics. - check if imbalance exits. - update imbalance and return busiest. Signed-off-by: Gautham R Shenoy Acked-by: Peter Zijlstra Cc: Suresh Siddha Cc: "Balbir Singh" Cc: Nick Piggin Cc: "Dhaval Giani" Cc: Bharata B Rao Cc: "Vaidyanathan Srinivasan" LKML-Reference: <20090325091411.13992.43293.stgit@sofia.in.ibm.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 78 +++++++++++++++++++++++++++++++++------------------------- 1 file changed, 45 insertions(+), 33 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index 540147e5e82..934f615ccce 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3487,8 +3487,8 @@ group_next: /** * fix_small_imbalance - Calculate the minor imbalance that exists - * amongst the groups of a sched_domain, during - * load balancing. + * amongst the groups of a sched_domain, during + * load balancing. * @sds: Statistics of the sched_domain whose imbalance is to be calculated. * @this_cpu: The cpu at whose sched_domain we're performing load-balance. * @imbalance: Variable to store the imbalance. @@ -3549,6 +3549,47 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds, if (pwr_move > pwr_now) *imbalance = sds->busiest_load_per_task; } + +/** + * calculate_imbalance - Calculate the amount of imbalance present within the + * groups of a given sched_domain during load balance. + * @sds: statistics of the sched_domain whose imbalance is to be calculated. + * @this_cpu: Cpu for which currently load balance is being performed. + * @imbalance: The variable to store the imbalance. + */ +static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, + unsigned long *imbalance) +{ + unsigned long max_pull; + /* + * In the presence of smp nice balancing, certain scenarios can have + * max load less than avg load(as we skip the groups at or below + * its cpu_power, while calculating max_load..) + */ + if (sds->max_load < sds->avg_load) { + *imbalance = 0; + return fix_small_imbalance(sds, this_cpu, imbalance); + } + + /* Don't want to pull so many tasks that a group would go idle */ + max_pull = min(sds->max_load - sds->avg_load, + sds->max_load - sds->busiest_load_per_task); + + /* How much load to actually move to equalise the imbalance */ + *imbalance = min(max_pull * sds->busiest->__cpu_power, + (sds->avg_load - sds->this_load) * sds->this->__cpu_power) + / SCHED_LOAD_SCALE; + + /* + * if *imbalance is less than the average load per runnable task + * there is no gaurantee that any tasks will be moved so we'll have + * a think about bumping its value to force at least one task to be + * moved + */ + if (*imbalance < sds->busiest_load_per_task) + return fix_small_imbalance(sds, this_cpu, imbalance); + +} /******* find_busiest_group() helpers end here *********************/ /* @@ -3562,7 +3603,6 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, int *sd_idle, const struct cpumask *cpus, int *balance) { struct sd_lb_stats sds; - unsigned long max_pull; memset(&sds, 0, sizeof(sds)); @@ -3605,36 +3645,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, if (sds.max_load <= sds.busiest_load_per_task) goto out_balanced; - /* - * In the presence of smp nice balancing, certain scenarios can have - * max load less than avg load(as we skip the groups at or below - * its cpu_power, while calculating max_load..) - */ - if (sds.max_load < sds.avg_load) { - *imbalance = 0; - fix_small_imbalance(&sds, this_cpu, imbalance); - goto ret_busiest; - } - - /* Don't want to pull so many tasks that a group would go idle */ - max_pull = min(sds.max_load - sds.avg_load, - sds.max_load - sds.busiest_load_per_task); - - /* How much load to actually move to equalise the imbalance */ - *imbalance = min(max_pull * sds.busiest->__cpu_power, - (sds.avg_load - sds.this_load) * sds.this->__cpu_power) - / SCHED_LOAD_SCALE; - - /* - * if *imbalance is less than the average load per runnable task - * there is no gaurantee that any tasks will be moved so we'll have - * a think about bumping its value to force at least one task to be - * moved - */ - if (*imbalance < sds.busiest_load_per_task) - fix_small_imbalance(&sds, this_cpu, imbalance); - -ret_busiest: + /* Looks like there is an imbalance. Compute it */ + calculate_imbalance(&sds, this_cpu, imbalance); return sds.busiest; out_balanced: -- cgit v1.2.3 From a021dc03376707c55a3483e32c16b8986d4414cc Mon Sep 17 00:00:00 2001 From: Gautham R Shenoy Date: Wed, 25 Mar 2009 14:44:17 +0530 Subject: sched: Optimize the !power_savings_balance during fbg() Impact: cleanup, micro-optimization We don't need to perform power_savings balance if either the cpu is NOT_IDLE or if the sched_domain doesn't contain the SD_POWERSAVINGS_BALANCE flag set. Currently, we check for these conditions multiple number of times, even though these variables don't change over the scope of find_busiest_group(). Check once, and store the value in the already exiting "power_savings_balance" variable. Signed-off-by: Gautham R Shenoy Acked-by: Peter Zijlstra Cc: Suresh Siddha Cc: "Balbir Singh" Cc: Nick Piggin Cc: "Dhaval Giani" Cc: Bharata B Rao Cc: "Vaidyanathan Srinivasan" LKML-Reference: <20090325091417.13992.2657.stgit@sofia.in.ibm.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index 934f615ccce..71e8dcaf2c7 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3386,8 +3386,17 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, int load_idx; #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) - sds->power_savings_balance = 1; - sds->min_nr_running = ULONG_MAX; + /* + * Busy processors will not participate in power savings + * balance. + */ + if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE)) + sds->power_savings_balance = 0; + else { + sds->power_savings_balance = 1; + sds->min_nr_running = ULONG_MAX; + sds->leader_nr_running = 0; + } #endif load_idx = get_sd_load_idx(sd, idle); @@ -3422,12 +3431,8 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, } #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) - /* - * Busy processors will not participate in power savings - * balance. - */ - if (idle == CPU_NOT_IDLE || - !(sd->flags & SD_POWERSAVINGS_BALANCE)) + + if (!sds->power_savings_balance) goto group_next; /* @@ -3651,7 +3656,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, out_balanced: #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) - if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE)) + if (!sds.power_savings_balance) goto ret; if (sds.this != sds.group_leader || sds.group_leader == sds.group_min) -- cgit v1.2.3 From c071df18525a95b37dd5821a6dc4af83bd18675e Mon Sep 17 00:00:00 2001 From: Gautham R Shenoy Date: Wed, 25 Mar 2009 14:44:22 +0530 Subject: sched: Refactor the power savings balance code Impact: cleanup Create seperate helper functions to initialize the power-savings-balance related variables, to update them and to check if we have a scope for performing power-savings balance. Add no-op inline functions for the !(CONFIG_SCHED_MC || CONFIG_SCHED_SMT) case. This will eliminate all the #ifdef jungle in find_busiest_group() and the other helper functions. Signed-off-by: Gautham R Shenoy Acked-by: Peter Zijlstra Cc: Suresh Siddha Cc: "Balbir Singh" Cc: Nick Piggin Cc: "Dhaval Giani" Cc: Bharata B Rao Cc: "Vaidyanathan Srinivasan" LKML-Reference: <20090325091422.13992.73616.stgit@sofia.in.ibm.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 236 +++++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 153 insertions(+), 83 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index 71e8dcaf2c7..5f21658b0f6 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3270,6 +3270,151 @@ static inline int get_sd_load_idx(struct sched_domain *sd, } +#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) +/** + * init_sd_power_savings_stats - Initialize power savings statistics for + * the given sched_domain, during load balancing. + * + * @sd: Sched domain whose power-savings statistics are to be initialized. + * @sds: Variable containing the statistics for sd. + * @idle: Idle status of the CPU at which we're performing load-balancing. + */ +static inline void init_sd_power_savings_stats(struct sched_domain *sd, + struct sd_lb_stats *sds, enum cpu_idle_type idle) +{ + /* + * Busy processors will not participate in power savings + * balance. + */ + if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE)) + sds->power_savings_balance = 0; + else { + sds->power_savings_balance = 1; + sds->min_nr_running = ULONG_MAX; + sds->leader_nr_running = 0; + } +} + +/** + * update_sd_power_savings_stats - Update the power saving stats for a + * sched_domain while performing load balancing. + * + * @group: sched_group belonging to the sched_domain under consideration. + * @sds: Variable containing the statistics of the sched_domain + * @local_group: Does group contain the CPU for which we're performing + * load balancing ? + * @sgs: Variable containing the statistics of the group. + */ +static inline void update_sd_power_savings_stats(struct sched_group *group, + struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs) +{ + + if (!sds->power_savings_balance) + return; + + /* + * If the local group is idle or completely loaded + * no need to do power savings balance at this domain + */ + if (local_group && (sds->this_nr_running >= sgs->group_capacity || + !sds->this_nr_running)) + sds->power_savings_balance = 0; + + /* + * If a group is already running at full capacity or idle, + * don't include that group in power savings calculations + */ + if (!sds->power_savings_balance || + sgs->sum_nr_running >= sgs->group_capacity || + !sgs->sum_nr_running) + return; + + /* + * Calculate the group which has the least non-idle load. + * This is the group from where we need to pick up the load + * for saving power + */ + if ((sgs->sum_nr_running < sds->min_nr_running) || + (sgs->sum_nr_running == sds->min_nr_running && + group_first_cpu(group) > group_first_cpu(sds->group_min))) { + sds->group_min = group; + sds->min_nr_running = sgs->sum_nr_running; + sds->min_load_per_task = sgs->sum_weighted_load / + sgs->sum_nr_running; + } + + /* + * Calculate the group which is almost near its + * capacity but still has some space to pick up some load + * from other group and save more power + */ + if (sgs->sum_nr_running > sgs->group_capacity - 1) + return; + + if (sgs->sum_nr_running > sds->leader_nr_running || + (sgs->sum_nr_running == sds->leader_nr_running && + group_first_cpu(group) < group_first_cpu(sds->group_leader))) { + sds->group_leader = group; + sds->leader_nr_running = sgs->sum_nr_running; + } +} + +/** + * check_power_save_busiest_group - Check if we have potential to perform + * some power-savings balance. If yes, set the busiest group to be + * the least loaded group in the sched_domain, so that it's CPUs can + * be put to idle. + * + * @sds: Variable containing the statistics of the sched_domain + * under consideration. + * @this_cpu: Cpu at which we're currently performing load-balancing. + * @imbalance: Variable to store the imbalance. + * + * Returns 1 if there is potential to perform power-savings balance. + * Else returns 0. + */ +static inline int check_power_save_busiest_group(struct sd_lb_stats *sds, + int this_cpu, unsigned long *imbalance) +{ + if (!sds->power_savings_balance) + return 0; + + if (sds->this != sds->group_leader || + sds->group_leader == sds->group_min) + return 0; + + *imbalance = sds->min_load_per_task; + sds->busiest = sds->group_min; + + if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) { + cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu = + group_first_cpu(sds->group_leader); + } + + return 1; + +} +#else /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ +static inline void init_sd_power_savings_stats(struct sched_domain *sd, + struct sd_lb_stats *sds, enum cpu_idle_type idle) +{ + return; +} + +static inline void update_sd_power_savings_stats(struct sched_group *group, + struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs) +{ + return; +} + +static inline int check_power_save_busiest_group(struct sd_lb_stats *sds, + int this_cpu, unsigned long *imbalance) +{ + return 0; +} +#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ + + /** * update_sg_lb_stats - Update sched_group's statistics for load balancing. * @group: sched_group whose statistics are to be updated. @@ -3385,19 +3530,7 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, struct sg_lb_stats sgs; int load_idx; -#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) - /* - * Busy processors will not participate in power savings - * balance. - */ - if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE)) - sds->power_savings_balance = 0; - else { - sds->power_savings_balance = 1; - sds->min_nr_running = ULONG_MAX; - sds->leader_nr_running = 0; - } -#endif + init_sd_power_savings_stats(sd, sds, idle); load_idx = get_sd_load_idx(sd, idle); do { @@ -3430,61 +3563,7 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, sds->group_imb = sgs.group_imb; } -#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) - - if (!sds->power_savings_balance) - goto group_next; - - /* - * If the local group is idle or completely loaded - * no need to do power savings balance at this domain - */ - if (local_group && - (sds->this_nr_running >= sgs.group_capacity || - !sds->this_nr_running)) - sds->power_savings_balance = 0; - - /* - * If a group is already running at full capacity or idle, - * don't include that group in power savings calculations - */ - if (!sds->power_savings_balance || - sgs.sum_nr_running >= sgs.group_capacity || - !sgs.sum_nr_running) - goto group_next; - - /* - * Calculate the group which has the least non-idle load. - * This is the group from where we need to pick up the load - * for saving power - */ - if ((sgs.sum_nr_running < sds->min_nr_running) || - (sgs.sum_nr_running == sds->min_nr_running && - group_first_cpu(group) > - group_first_cpu(sds->group_min))) { - sds->group_min = group; - sds->min_nr_running = sgs.sum_nr_running; - sds->min_load_per_task = sgs.sum_weighted_load / - sgs.sum_nr_running; - } - - /* - * Calculate the group which is almost near its - * capacity but still has some space to pick up some load - * from other group and save more power - */ - if (sgs.sum_nr_running > sgs.group_capacity - 1) - goto group_next; - - if (sgs.sum_nr_running > sds->leader_nr_running || - (sgs.sum_nr_running == sds->leader_nr_running && - group_first_cpu(group) < - group_first_cpu(sds->group_leader))) { - sds->group_leader = group; - sds->leader_nr_running = sgs.sum_nr_running; - } -group_next: -#endif + update_sd_power_savings_stats(group, sds, local_group, &sgs); group = group->next; } while (group != sd->groups); @@ -3655,21 +3734,12 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, return sds.busiest; out_balanced: -#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) - if (!sds.power_savings_balance) - goto ret; - - if (sds.this != sds.group_leader || sds.group_leader == sds.group_min) - goto ret; - - *imbalance = sds.min_load_per_task; - if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) { - cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu = - group_first_cpu(sds.group_leader); - } - return sds.group_min; - -#endif + /* + * There is no obvious imbalance. But check if we can do some balancing + * to save power. + */ + if (check_power_save_busiest_group(&sds, this_cpu, imbalance)) + return sds.busiest; ret: *imbalance = 0; return NULL; -- cgit v1.2.3 From b7bb4c9bb01941fe8feb653f3410e7ed0c9bb786 Mon Sep 17 00:00:00 2001 From: Gautham R Shenoy Date: Wed, 25 Mar 2009 14:44:27 +0530 Subject: sched: Add comments to find_busiest_group() function Impact: cleanup Add /** style comments around find_busiest_group(). Also add a few explanatory comments. This concludes the find_busiest_group() cleanup. The function is now down to 72 lines from the original 313 lines. Signed-off-by: Gautham R Shenoy Acked-by: Peter Zijlstra Cc: Suresh Siddha Cc: "Balbir Singh" Cc: Nick Piggin Cc: "Dhaval Giani" Cc: Bharata B Rao Cc: "Vaidyanathan Srinivasan" LKML-Reference: <20090325091427.13992.18933.stgit@sofia.in.ibm.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 50 ++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 42 insertions(+), 8 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index 5f21658b0f6..9f8506d68fd 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3676,10 +3676,30 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, } /******* find_busiest_group() helpers end here *********************/ -/* - * find_busiest_group finds and returns the busiest CPU group within the - * domain. It calculates and returns the amount of weighted load which - * should be moved to restore balance via the imbalance parameter. +/** + * find_busiest_group - Returns the busiest group within the sched_domain + * if there is an imbalance. If there isn't an imbalance, and + * the user has opted for power-savings, it returns a group whose + * CPUs can be put to idle by rebalancing those tasks elsewhere, if + * such a group exists. + * + * Also calculates the amount of weighted load which should be moved + * to restore balance. + * + * @sd: The sched_domain whose busiest group is to be returned. + * @this_cpu: The cpu for which load balancing is currently being performed. + * @imbalance: Variable which stores amount of weighted load which should + * be moved to restore balance/put a group to idle. + * @idle: The idle status of this_cpu. + * @sd_idle: The idleness of sd + * @cpus: The set of CPUs under consideration for load-balancing. + * @balance: Pointer to a variable indicating if this_cpu + * is the appropriate cpu to perform load balancing at this_level. + * + * Returns: - the busiest group if imbalance exists. + * - If no imbalance and user has opted for power-savings balance, + * return the least loaded group whose CPUs can be + * put to idle by rebalancing its tasks onto our group. */ static struct sched_group * find_busiest_group(struct sched_domain *sd, int this_cpu, @@ -3697,17 +3717,31 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, update_sd_lb_stats(sd, this_cpu, idle, sd_idle, cpus, balance, &sds); + /* Cases where imbalance does not exist from POV of this_cpu */ + /* 1) this_cpu is not the appropriate cpu to perform load balancing + * at this level. + * 2) There is no busy sibling group to pull from. + * 3) This group is the busiest group. + * 4) This group is more busy than the avg busieness at this + * sched_domain. + * 5) The imbalance is within the specified limit. + * 6) Any rebalance would lead to ping-pong + */ if (balance && !(*balance)) goto ret; - if (!sds.busiest || sds.this_load >= sds.max_load - || sds.busiest_nr_running == 0) + if (!sds.busiest || sds.busiest_nr_running == 0) + goto out_balanced; + + if (sds.this_load >= sds.max_load) goto out_balanced; sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr; - if (sds.this_load >= sds.avg_load || - 100*sds.max_load <= sd->imbalance_pct * sds.this_load) + if (sds.this_load >= sds.avg_load) + goto out_balanced; + + if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load) goto out_balanced; sds.busiest_load_per_task /= sds.busiest_nr_running; -- cgit v1.2.3