/*
* find_busiest_group finds and returns the busiest CPU group within the
* domain. It calculates and returns the amount of weighted load which
* should be moved to restore balance via the imbalance parameter.
*/
static struct sched_group *
find_busiest_group(struct sched_domain *sd, int this_cpu,
unsigned long *imbalance, enum cpu_idle_type idle,
int *sd_idle, const struct cpumask *cpus, int *balance)
{
struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
unsigned long max_load, avg_load, total_load, this_load, total_pwr;
unsigned long max_pull;
unsigned long busiest_load_per_task, busiest_nr_running;
unsigned long this_load_per_task, this_nr_running;
int load_idx, group_imb = 0;
#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
int power_savings_balance = 1;
unsigned long leader_nr_running = 0, min_load_per_task = 0;
unsigned long min_nr_running = ULONG_MAX;
struct sched_group *group_min = NULL, *group_leader = NULL;
#endif
max_load = this_load = total_load = total_pwr = 0;
busiest_load_per_task = busiest_nr_running = 0;
this_load_per_task = this_nr_running = 0;
if (idle == CPU_NOT_IDLE)
load_idx = sd->busy_idx;
else if (idle == CPU_NEWLY_IDLE)
load_idx = sd->newidle_idx;
else
load_idx = sd->idle_idx;
do {
unsigned long load, group_capacity, max_cpu_load, min_cpu_load;
int local_group;
int i;
int __group_imb = 0;
unsigned int balance_cpu = -1, first_idle_cpu = 0;
unsigned long sum_nr_running, sum_weighted_load;
unsigned long sum_avg_load_per_task;
unsigned long avg_load_per_task;
local_group = cpumask_test_cpu(this_cpu,
sched_group_cpus(group));
if (local_group)
balance_cpu = cpumask_first(sched_group_cpus(group));
/* Tally up the load of all CPUs in the group */
sum_weighted_load = sum_nr_running = avg_load = 0;
sum_avg_load_per_task = avg_load_per_task = 0;
max_cpu_load = 0;
min_cpu_load = ~0UL;
for_each_cpu_and(i, sched_group_cpus(group), cpus) {
struct rq *rq = cpu_rq(i);
if (*sd_idle && rq->nr_running)
*sd_idle = 0;
/* Bias balancing toward cpus of our domain */
if (local_group) {
if (idle_cpu(i) && !first_idle_cpu) {
first_idle_cpu = 1;
balance_cpu = i;
}
load = target_load(i, load_idx);
} else {
load = source_load(i, load_idx);
if (load > max_cpu_load)
max_cpu_load = load;
if (min_cpu_load > load)
min_cpu_load = load;
}
avg_load += load;
sum_nr_running += rq->nr_running;
sum_weighted_load += weighted_cpuload(i);
sum_avg_load_per_task += cpu_avg_load_per_task(i);
}
/*
* First idle cpu or the first cpu(busiest) in this sched group
* is eligible for doing load balancing at this and above
* domains. In the newly idle case, we will allow all the cpu's
* to do the newly idle load balance.
*/
if (idle != CPU_NEWLY_IDLE && local_group &&
balance_cpu != this_cpu && balance) {
*balance = 0;
goto ret;
}
total_load += avg_load;
total_pwr += group->__cpu_power;
/* Adjust by relative CPU power of the group */
avg_load = sg_div_cpu_power(group,
avg_load * SCHED_LOAD_SCALE);
/*
* Consider the group unbalanced when the imbalance is larger
* than the average weight of two tasks.
*
* APZ: with cgroup the avg task weight can vary wildly and
* might not be a suitable number - should we keep a
* normalized nr_running number somewhere that negates
* the hierarchy?
*/
avg_load_per_task = sg_div_cpu_power(group,
sum_avg_load_per_task * SCHED_LOAD_SCALE);
if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
__group_imb = 1;
group_capacity = group->__cpu_power / SCHED_LOAD_SCALE;
if (local_group) {
this_load = avg_load;
this = group;
this_nr_running = sum_nr_running;
this_load_per_task = sum_weighted_load;
} else if (avg_load > max_load &&
(sum_nr_running > group_capacity || __group_imb)) {
max_load = avg_load;
busiest = group;
busiest_nr_running = sum_nr_running;
busiest_load_per_task = sum_weighted_load;
group_imb = __group_imb;
}
#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
/*
* Busy processors will not participate in power savings
* balance.
*/
if (idle == CPU_NOT_IDLE ||
!(sd->flags & SD_POWERSAVINGS_BALANCE))
goto group_next;
/*
* If the local group is idle or completely loaded
* no need to do power savings balance at this domain
*/
if (local_group && (this_nr_running >= group_capacity ||
!this_nr_running))
power_savings_balance = 0;
/*
* If a group is already running at full capacity or idle,
* don't include that group in power savings calculations
*/
if (!power_savings_balance || sum_nr_running >= group_capacity
|| !sum_nr_running)
goto group_next;
/*
* Calculate the group which has the least non-idle load.
* This is the group from where we need to pick up the load
* for saving power
*/
if ((sum_nr_running < min_nr_running) ||
(sum_nr_running == min_nr_running &&
cpumask_first(sched_group_cpus(group)) >
cpumask_first(sched_group_cpus(group_min)))) {
group_min = group;
min_nr_running = sum_nr_running;
min_load_per_task = sum_weighted_load /
sum_nr_running;
}
/*
* Calculate the group which is almost near its
* capacity but still has some space to pick up some load
* from other group and save more power
*/
if (sum_nr_running <= group_capacity - 1) {
if (sum_nr_running > leader_nr_running ||
(sum_nr_running == leader_nr_running &&
cpumask_first(sched_group_cpus(group)) <
cpumask_first(sched_group_cpus(group_leader)))) {
group_leader = group;
leader_nr_running = sum_nr_running;
}
}
group_next:
#endif
group = group->next;
} while (group != sd->groups);
if (!busiest || this_load >= max_load || busiest_nr_running == 0)
goto out_balanced;
avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr;
if (this_load >= avg_load ||
100*max_load <= sd->imbalance_pct*this_load)
goto out_balanced;
busiest_load_per_task /= busiest_nr_running;
if (group_imb)
busiest_load_per_task = min(busiest_load_per_task, avg_load);
/*
* We're trying to get all the cpus to the average_load, so we don't
* want to push ourselves above the average load, nor do we wish to
* reduce the max loaded cpu below the average load, as either of these
* actions would just result in more rebalancing later, and ping-pong
* tasks around. Thus we look for the minimum possible imbalance.
* Negative imbalances (*we* are more loaded than anyone else) will
* be counted as no imbalance for these purposes -- we can't fix that
* by pulling tasks to us. Be careful of negative numbers as they'll
* appear as very large values with unsigned longs.
*/
if (max_load <= busiest_load_per_task)
goto out_balanced;
/*
* In the presence of smp nice balancing, certain scenarios can have
* max load less than avg load(as we skip the groups at or below
* its cpu_power, while calculating max_load..)
*/
if (max_load < avg_load) {
*imbalance = 0;
goto small_imbalance;
}
/* Don't want to pull so many tasks that a group would go idle */
max_pull = min(max_load - avg_load, max_load - busiest_load_per_task);
/* How much load to actually move to equalise the imbalance */
*imbalance = min(max_pull * busiest->__cpu_power,
(avg_load - this_load) * this->__cpu_power)
/ SCHED_LOAD_SCALE;
/*
* if *imbalance is less than the average load per runnable task
* there is no gaurantee that any tasks will be moved so we'll have
* a think about bumping its value to force at least one task to be
* moved
*/
if (*imbalance < busiest_load_per_task) {
unsigned long tmp, pwr_now, pwr_move;
unsigned int imbn;
small_imbalance:
pwr_move = pwr_now = 0;
imbn = 2;
if (this_nr_running) {
this_load_per_task /= this_nr_running;
if (busiest_load_per_task > this_load_per_task)
imbn = 1;
} else
this_load_per_task = cpu_avg_load_per_task(this_cpu);
if (max_load - this_load + busiest_load_per_task >=
busiest_load_per_task * imbn) {
*imbalance = busiest_load_per_task;
return busiest;
}
/*
* OK, we don't have enough imbalance to justify moving tasks,
* however we may be able to increase total CPU power used by
* moving them.
*/
pwr_now += busiest->__cpu_power *
min(busiest_load_per_task, max_load);
pwr_now += this->__cpu_power *
min(this_load_per_task, this_load);
pwr_now /= SCHED_LOAD_SCALE;
/* Amount of load we'd subtract */
tmp = sg_div_cpu_power(busiest,
busiest_load_per_task * SCHED_LOAD_SCALE);
if (max_load > tmp)
pwr_move += busiest->__cpu_power *
min(busiest_load_per_task, max_load - tmp);
/* Amount of load we'd add */
if (max_load * busiest->__cpu_power <
busiest_load_per_task * SCHED_LOAD_SCALE)
tmp = sg_div_cpu_power(this,
max_load * busiest->__cpu_power);
else
tmp = sg_div_cpu_power(this,
busiest_load_per_task * SCHED_LOAD_SCALE);
pwr_move += this->__cpu_power *
min(this_load_per_task, this_load + tmp);
pwr_move /= SCHED_LOAD_SCALE;
/* Move if we gain throughput */
if (pwr_move > pwr_now)
*imbalance = busiest_load_per_task;
}
return busiest;
out_balanced:
#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
goto ret;
if (this == group_leader && group_leader != group_min) {
*imbalance = min_load_per_task;
if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) {
cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu =
cpumask_first(sched_group_cpus(group_leader));
}
return group_min;
}
#endif
ret:
*imbalance = 0;
return NULL;
}