/*
 * find_busiest_group finds and returns the busiest CPU group within the
 * domain. It calculates and returns the amount of weighted load which
 * should be moved to restore balance via the imbalance parameter.
 */
static struct sched_group *
find_busiest_group(struct sched_domain *sd, int this_cpu,
                   unsigned long *imbalance, enum cpu_idle_type idle,
                   int *sd_idle, const struct cpumask *cpus, int *balance)
{
        struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
        unsigned long max_load, avg_load, total_load, this_load, total_pwr;
        unsigned long max_pull;
        unsigned long busiest_load_per_task, busiest_nr_running;
        unsigned long this_load_per_task, this_nr_running;
        int load_idx, group_imb = 0;
#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
        int power_savings_balance = 1;
        unsigned long leader_nr_running = 0, min_load_per_task = 0;
        unsigned long min_nr_running = ULONG_MAX;
        struct sched_group *group_min = NULL, *group_leader = NULL;
#endif

        max_load = this_load = total_load = total_pwr = 0;
        busiest_load_per_task = busiest_nr_running = 0;
        this_load_per_task = this_nr_running = 0;

        if (idle == CPU_NOT_IDLE)
                load_idx = sd->busy_idx;
        else if (idle == CPU_NEWLY_IDLE)
                load_idx = sd->newidle_idx;
        else
                load_idx = sd->idle_idx;

        do {
                unsigned long load, group_capacity, max_cpu_load, min_cpu_load;
                int local_group;
                int i;
                int __group_imb = 0;
                unsigned int balance_cpu = -1, first_idle_cpu = 0;
                unsigned long sum_nr_running, sum_weighted_load;
                unsigned long sum_avg_load_per_task;
                unsigned long avg_load_per_task;

                local_group = cpumask_test_cpu(this_cpu,
                                               sched_group_cpus(group));

                if (local_group)
                        balance_cpu = cpumask_first(sched_group_cpus(group));

                /* Tally up the load of all CPUs in the group */
                sum_weighted_load = sum_nr_running = avg_load = 0;
                sum_avg_load_per_task = avg_load_per_task = 0;

                max_cpu_load = 0;
                min_cpu_load = ~0UL;

                for_each_cpu_and(i, sched_group_cpus(group), cpus) {
                        struct rq *rq = cpu_rq(i);

                        if (*sd_idle && rq->nr_running)
                                *sd_idle = 0;

                        /* Bias balancing toward cpus of our domain */
                        if (local_group) {
                                if (idle_cpu(i) && !first_idle_cpu) {
                                        first_idle_cpu = 1;
                                        balance_cpu = i;
                                }

                                load = target_load(i, load_idx);
                        } else {
                                load = source_load(i, load_idx);
                                if (load > max_cpu_load)
                                        max_cpu_load = load;
                                if (min_cpu_load > load)
                                        min_cpu_load = load;
                        }

                        avg_load += load;
                        sum_nr_running += rq->nr_running;
                        sum_weighted_load += weighted_cpuload(i);

                        sum_avg_load_per_task += cpu_avg_load_per_task(i);
                }

                /*
                 * First idle cpu or the first cpu(busiest) in this sched group
                 * is eligible for doing load balancing at this and above
                 * domains. In the newly idle case, we will allow all the cpu's
                 * to do the newly idle load balance.
                 */
                if (idle != CPU_NEWLY_IDLE && local_group &&
                    balance_cpu != this_cpu && balance) {
                        *balance = 0;
                        goto ret;
                }

                total_load += avg_load;
                total_pwr += group->__cpu_power;

                /* Adjust by relative CPU power of the group */
                avg_load = sg_div_cpu_power(group,
                                avg_load * SCHED_LOAD_SCALE);


                /*
                 * Consider the group unbalanced when the imbalance is larger
                 * than the average weight of two tasks.
                 *
                 * APZ: with cgroup the avg task weight can vary wildly and
                 *      might not be a suitable number - should we keep a
                 *      normalized nr_running number somewhere that negates
                 *      the hierarchy?
                 */
                avg_load_per_task = sg_div_cpu_power(group,
                                sum_avg_load_per_task * SCHED_LOAD_SCALE);

                if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
                        __group_imb = 1;

                group_capacity = group->__cpu_power / SCHED_LOAD_SCALE;

                if (local_group) {
                        this_load = avg_load;
                        this = group;
                        this_nr_running = sum_nr_running;
                        this_load_per_task = sum_weighted_load;
                } else if (avg_load > max_load &&
                           (sum_nr_running > group_capacity || __group_imb)) {
                        max_load = avg_load;
                        busiest = group;
                        busiest_nr_running = sum_nr_running;
                        busiest_load_per_task = sum_weighted_load;
                        group_imb = __group_imb;
                }

#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
                /*
                 * Busy processors will not participate in power savings
                 * balance.
                 */
                if (idle == CPU_NOT_IDLE ||
                                !(sd->flags & SD_POWERSAVINGS_BALANCE))
                        goto group_next;

                /*
                 * If the local group is idle or completely loaded
                 * no need to do power savings balance at this domain
                 */
                if (local_group && (this_nr_running >= group_capacity ||
                                    !this_nr_running))
                        power_savings_balance = 0;

                /*
                 * If a group is already running at full capacity or idle,
                 * don't include that group in power savings calculations
                 */
                if (!power_savings_balance || sum_nr_running >= group_capacity
                    || !sum_nr_running)
                        goto group_next;

                /*
                 * Calculate the group which has the least non-idle load.
                 * This is the group from where we need to pick up the load
                 * for saving power
                 */
                if ((sum_nr_running < min_nr_running) ||
                    (sum_nr_running == min_nr_running &&
                     cpumask_first(sched_group_cpus(group)) >
                     cpumask_first(sched_group_cpus(group_min)))) {
                        group_min = group;
                        min_nr_running = sum_nr_running;
                        min_load_per_task = sum_weighted_load /
                                                sum_nr_running;
                }

                /*
                 * Calculate the group which is almost near its
                 * capacity but still has some space to pick up some load
                 * from other group and save more power
                 */
                if (sum_nr_running <= group_capacity - 1) {
                        if (sum_nr_running > leader_nr_running ||
                            (sum_nr_running == leader_nr_running &&
                             cpumask_first(sched_group_cpus(group)) <
                             cpumask_first(sched_group_cpus(group_leader)))) {
                                group_leader = group;
                                leader_nr_running = sum_nr_running;
                        }
                }
group_next:
#endif
                group = group->next;
        } while (group != sd->groups);

        if (!busiest || this_load >= max_load || busiest_nr_running == 0)
                goto out_balanced;

        avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr;

        if (this_load >= avg_load ||
                        100*max_load <= sd->imbalance_pct*this_load)
                goto out_balanced;

        busiest_load_per_task /= busiest_nr_running;
        if (group_imb)
                busiest_load_per_task = min(busiest_load_per_task, avg_load);

        /*
         * We're trying to get all the cpus to the average_load, so we don't
         * want to push ourselves above the average load, nor do we wish to
         * reduce the max loaded cpu below the average load, as either of these
         * actions would just result in more rebalancing later, and ping-pong
         * tasks around. Thus we look for the minimum possible imbalance.
         * Negative imbalances (*we* are more loaded than anyone else) will
         * be counted as no imbalance for these purposes -- we can't fix that
         * by pulling tasks to us. Be careful of negative numbers as they'll
         * appear as very large values with unsigned longs.
         */
        if (max_load <= busiest_load_per_task)
                goto out_balanced;

        /*
         * In the presence of smp nice balancing, certain scenarios can have
         * max load less than avg load(as we skip the groups at or below
         * its cpu_power, while calculating max_load..)
         */
        if (max_load < avg_load) {
                *imbalance = 0;
                goto small_imbalance;
        }

        /* Don't want to pull so many tasks that a group would go idle */
        max_pull = min(max_load - avg_load, max_load - busiest_load_per_task);

        /* How much load to actually move to equalise the imbalance */
        *imbalance = min(max_pull * busiest->__cpu_power,
                                (avg_load - this_load) * this->__cpu_power)
                        / SCHED_LOAD_SCALE;

        /*
         * if *imbalance is less than the average load per runnable task
         * there is no gaurantee that any tasks will be moved so we'll have
         * a think about bumping its value to force at least one task to be
         * moved
         */
        if (*imbalance < busiest_load_per_task) {
                unsigned long tmp, pwr_now, pwr_move;
                unsigned int imbn;

small_imbalance:
                pwr_move = pwr_now = 0;
                imbn = 2;
                if (this_nr_running) {
                        this_load_per_task /= this_nr_running;
                        if (busiest_load_per_task > this_load_per_task)
                                imbn = 1;
                } else
                        this_load_per_task = cpu_avg_load_per_task(this_cpu);

                if (max_load - this_load + busiest_load_per_task >=
                                        busiest_load_per_task * imbn) {
                        *imbalance = busiest_load_per_task;
                        return busiest;
                }

                /*
                 * OK, we don't have enough imbalance to justify moving tasks,
                 * however we may be able to increase total CPU power used by
                 * moving them.
                 */

                pwr_now += busiest->__cpu_power *
                                min(busiest_load_per_task, max_load);
                pwr_now += this->__cpu_power *
                                min(this_load_per_task, this_load);
                pwr_now /= SCHED_LOAD_SCALE;

                /* Amount of load we'd subtract */
                tmp = sg_div_cpu_power(busiest,
                                busiest_load_per_task * SCHED_LOAD_SCALE);
                if (max_load > tmp)
                        pwr_move += busiest->__cpu_power *
                                min(busiest_load_per_task, max_load - tmp);

                /* Amount of load we'd add */
                if (max_load * busiest->__cpu_power <
                                busiest_load_per_task * SCHED_LOAD_SCALE)
                        tmp = sg_div_cpu_power(this,
                                        max_load * busiest->__cpu_power);
                else
                        tmp = sg_div_cpu_power(this,
                                busiest_load_per_task * SCHED_LOAD_SCALE);
                pwr_move += this->__cpu_power *
                                min(this_load_per_task, this_load + tmp);
                pwr_move /= SCHED_LOAD_SCALE;

                /* Move if we gain throughput */
                if (pwr_move > pwr_now)
                        *imbalance = busiest_load_per_task;
        }

        return busiest;

out_balanced:
#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
        if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
                goto ret;

        if (this == group_leader && group_leader != group_min) {
                *imbalance = min_load_per_task;
                if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) {
                        cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu =
                                cpumask_first(sched_group_cpus(group_leader));
                }
                return group_min;
        }
#endif
ret:
        *imbalance = 0;
        return NULL;
}