/********** Helpers for find_busiest_group ************************/
/**
 * sd_lb_stats - Structure to store the statistics of a sched_domain
 *              during load balancing.
 */
struct sd_lb_stats {
        struct sched_group *busiest; /* Busiest group in this sd */
        struct sched_group *this;  /* Local group in this sd */
        unsigned long total_load;  /* Total load of all groups in sd */
        unsigned long total_pwr;   /*  Total power of all groups in sd */
        unsigned long avg_load;           /* Average load across all groups in sd */

        /** Statistics of this group */
        unsigned long this_load;
        unsigned long this_load_per_task;
        unsigned long this_nr_running;

        /* Statistics of the busiest group */
        unsigned long max_load;
        unsigned long busiest_load_per_task;
        unsigned long busiest_nr_running;

        int group_imb; /* Is there imbalance in this sd */
#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
        int power_savings_balance; /* Is powersave balance needed for this sd */
        struct sched_group *group_min; /* Least loaded group in sd */
        struct sched_group *group_leader; /* Group which relieves group_min */
        unsigned long min_load_per_task; /* load_per_task in group_min */
        unsigned long leader_nr_running; /* Nr running of group_leader */
        unsigned long min_nr_running; /* Nr running of group_min */
#endif
};

/**
 * sg_lb_stats - stats of a sched_group required for load_balancing
 */
struct sg_lb_stats {
        unsigned long avg_load; /*Avg load across the CPUs of the group */
        unsigned long group_load; /* Total load over the CPUs of the group */
        unsigned long sum_nr_running; /* Nr tasks running in the group */
        unsigned long sum_weighted_load; /* Weighted load of group's tasks */
        unsigned long group_capacity;
        int group_imb; /* Is there an imbalance in the group ? */
};

/**
 * group_first_cpu - Returns the first cpu in the cpumask of a sched_group.
 * @group: The group whose first cpu is to be returned.
 */
static inline unsigned int group_first_cpu(struct sched_group *group)
{
        return cpumask_first(sched_group_cpus(group));
}

/**
 * get_sd_load_idx - Obtain the load index for a given sched domain.
 * @sd: The sched_domain whose load_idx is to be obtained.
 * @idle: The Idle status of the CPU for whose sd load_icx is obtained.
 */
static inline int get_sd_load_idx(struct sched_domain *sd,
                                        enum cpu_idle_type idle)
{
        int load_idx;

        switch (idle) {
        case CPU_NOT_IDLE:
                load_idx = sd->busy_idx;
                break;

        case CPU_NEWLY_IDLE:
                load_idx = sd->newidle_idx;
                break;
        default:
                load_idx = sd->idle_idx;
                break;
        }

        return load_idx;
}


#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
/**
 * init_sd_power_savings_stats - Initialize power savings statistics for
 * the given sched_domain, during load balancing.
 *
 * @sd: Sched domain whose power-savings statistics are to be initialized.
 * @sds: Variable containing the statistics for sd.
 * @idle: Idle status of the CPU at which we're performing load-balancing.
 */
static inline void init_sd_power_savings_stats(struct sched_domain *sd,
        struct sd_lb_stats *sds, enum cpu_idle_type idle)
{
        /*
         * Busy processors will not participate in power savings
         * balance.
         */
        if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
                sds->power_savings_balance = 0;
        else {
                sds->power_savings_balance = 1;
                sds->min_nr_running = ULONG_MAX;
                sds->leader_nr_running = 0;
        }
}

/**
 * update_sd_power_savings_stats - Update the power saving stats for a
 * sched_domain while performing load balancing.
 *
 * @group: sched_group belonging to the sched_domain under consideration.
 * @sds: Variable containing the statistics of the sched_domain
 * @local_group: Does group contain the CPU for which we're performing
 *              load balancing ?
 * @sgs: Variable containing the statistics of the group.
 */
static inline void update_sd_power_savings_stats(struct sched_group *group,
        struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
{

        if (!sds->power_savings_balance)
                return;

        /*
         * If the local group is idle or completely loaded
         * no need to do power savings balance at this domain
         */
        if (local_group && (sds->this_nr_running >= sgs->group_capacity ||
                                !sds->this_nr_running))
                sds->power_savings_balance = 0;

        /*
         * If a group is already running at full capacity or idle,
         * don't include that group in power savings calculations
         */
        if (!sds->power_savings_balance ||
                sgs->sum_nr_running >= sgs->group_capacity ||
                !sgs->sum_nr_running)
                return;

        /*
         * Calculate the group which has the least non-idle load.
         * This is the group from where we need to pick up the load
         * for saving power
         */
        if ((sgs->sum_nr_running < sds->min_nr_running) ||
            (sgs->sum_nr_running == sds->min_nr_running &&
             group_first_cpu(group) > group_first_cpu(sds->group_min))) {
                sds->group_min = group;
                sds->min_nr_running = sgs->sum_nr_running;
                sds->min_load_per_task = sgs->sum_weighted_load /
                                                sgs->sum_nr_running;
        }

        /*
         * Calculate the group which is almost near its
         * capacity but still has some space to pick up some load
         * from other group and save more power
         */
        if (sgs->sum_nr_running > sgs->group_capacity - 1)
                return;

        if (sgs->sum_nr_running > sds->leader_nr_running ||
            (sgs->sum_nr_running == sds->leader_nr_running &&
             group_first_cpu(group) < group_first_cpu(sds->group_leader))) {
                sds->group_leader = group;
                sds->leader_nr_running = sgs->sum_nr_running;
        }
}

/**
 * check_power_save_busiest_group - Check if we have potential to perform
 *      some power-savings balance. If yes, set the busiest group to be
 *      the least loaded group in the sched_domain, so that it's CPUs can
 *      be put to idle.
 *
 * @sds: Variable containing the statistics of the sched_domain
 *      under consideration.
 * @this_cpu: Cpu at which we're currently performing load-balancing.
 * @imbalance: Variable to store the imbalance.
 *
 * Returns 1 if there is potential to perform power-savings balance.
 * Else returns 0.
 */
static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
                                        int this_cpu, unsigned long *imbalance)
{
        if (!sds->power_savings_balance)
                return 0;

        if (sds->this != sds->group_leader ||
                        sds->group_leader == sds->group_min)
                return 0;

        *imbalance = sds->min_load_per_task;
        sds->busiest = sds->group_min;

        if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) {
                cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu =
                        group_first_cpu(sds->group_leader);
        }

        return 1;

}
#else /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
static inline void init_sd_power_savings_stats(struct sched_domain *sd,
        struct sd_lb_stats *sds, enum cpu_idle_type idle)
{
        return;
}

static inline void update_sd_power_savings_stats(struct sched_group *group,
        struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
{
        return;
}

static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
                                        int this_cpu, unsigned long *imbalance)
{
        return 0;
}
#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */


/**
 * update_sg_lb_stats - Update sched_group's statistics for load balancing.
 * @group: sched_group whose statistics are to be updated.
 * @this_cpu: Cpu for which load balance is currently performed.
 * @idle: Idle status of this_cpu
 * @load_idx: Load index of sched_domain of this_cpu for load calc.
 * @sd_idle: Idle status of the sched_domain containing group.
 * @local_group: Does group contain this_cpu.
 * @cpus: Set of cpus considered for load balancing.
 * @balance: Should we balance.
 * @sgs: variable to hold the statistics for this group.
 */
static inline void update_sg_lb_stats(struct sched_group *group, int this_cpu,
                        enum cpu_idle_type idle, int load_idx, int *sd_idle,
                        int local_group, const struct cpumask *cpus,
                        int *balance, struct sg_lb_stats *sgs)
{
        unsigned long load, max_cpu_load, min_cpu_load;
        int i;
        unsigned int balance_cpu = -1, first_idle_cpu = 0;
        unsigned long sum_avg_load_per_task;
        unsigned long avg_load_per_task;

        if (local_group)
                balance_cpu = group_first_cpu(group);

        /* Tally up the load of all CPUs in the group */
        sum_avg_load_per_task = avg_load_per_task = 0;
        max_cpu_load = 0;
        min_cpu_load = ~0UL;

        for_each_cpu_and(i, sched_group_cpus(group), cpus) {
                struct rq *rq = cpu_rq(i);

                if (*sd_idle && rq->nr_running)
                        *sd_idle = 0;

                /* Bias balancing toward cpus of our domain */
                if (local_group) {
                        if (idle_cpu(i) && !first_idle_cpu) {
                                first_idle_cpu = 1;
                                balance_cpu = i;
                        }

                        load = target_load(i, load_idx);
                } else {
                        load = source_load(i, load_idx);
                        if (load > max_cpu_load)
                                max_cpu_load = load;
                        if (min_cpu_load > load)
                                min_cpu_load = load;
                }

                sgs->group_load += load;
                sgs->sum_nr_running += rq->nr_running;
                sgs->sum_weighted_load += weighted_cpuload(i);

                sum_avg_load_per_task += cpu_avg_load_per_task(i);
        }

        /*
         * First idle cpu or the first cpu(busiest) in this sched group
         * is eligible for doing load balancing at this and above
         * domains. In the newly idle case, we will allow all the cpu's
         * to do the newly idle load balance.
         */
        if (idle != CPU_NEWLY_IDLE && local_group &&
            balance_cpu != this_cpu && balance) {
                *balance = 0;
                return;
        }

        /* Adjust by relative CPU power of the group */
        sgs->avg_load = sg_div_cpu_power(group,
                        sgs->group_load * SCHED_LOAD_SCALE);


        /*
         * Consider the group unbalanced when the imbalance is larger
         * than the average weight of two tasks.
         *
         * APZ: with cgroup the avg task weight can vary wildly and
         *      might not be a suitable number - should we keep a
         *      normalized nr_running number somewhere that negates
         *      the hierarchy?
         */
        avg_load_per_task = sg_div_cpu_power(group,
                        sum_avg_load_per_task * SCHED_LOAD_SCALE);

        if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
                sgs->group_imb = 1;

        sgs->group_capacity = group->__cpu_power / SCHED_LOAD_SCALE;

}

/**
 * update_sd_lb_stats - Update sched_group's statistics for load balancing.
 * @sd: sched_domain whose statistics are to be updated.
 * @this_cpu: Cpu for which load balance is currently performed.
 * @idle: Idle status of this_cpu
 * @sd_idle: Idle status of the sched_domain containing group.
 * @cpus: Set of cpus considered for load balancing.
 * @balance: Should we balance.
 * @sds: variable to hold the statistics for this sched_domain.
 */
static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
                        enum cpu_idle_type idle, int *sd_idle,
                        const struct cpumask *cpus, int *balance,
                        struct sd_lb_stats *sds)
{
        struct sched_group *group = sd->groups;
        struct sg_lb_stats sgs;
        int load_idx;

        init_sd_power_savings_stats(sd, sds, idle);
        load_idx = get_sd_load_idx(sd, idle);

        do {
                int local_group;

                local_group = cpumask_test_cpu(this_cpu,
                                               sched_group_cpus(group));
                memset(&sgs, 0, sizeof(sgs));
                update_sg_lb_stats(group, this_cpu, idle, load_idx, sd_idle,
                                local_group, cpus, balance, &sgs);

                if (local_group && balance && !(*balance))
                        return;

                sds->total_load += sgs.group_load;
                sds->total_pwr += group->__cpu_power;

                if (local_group) {
                        sds->this_load = sgs.avg_load;
                        sds->this = group;
                        sds->this_nr_running = sgs.sum_nr_running;
                        sds->this_load_per_task = sgs.sum_weighted_load;
                } else if (sgs.avg_load > sds->max_load &&
                           (sgs.sum_nr_running > sgs.group_capacity ||
                                sgs.group_imb)) {
                        sds->max_load = sgs.avg_load;
                        sds->busiest = group;
                        sds->busiest_nr_running = sgs.sum_nr_running;
                        sds->busiest_load_per_task = sgs.sum_weighted_load;
                        sds->group_imb = sgs.group_imb;
                }

                update_sd_power_savings_stats(group, sds, local_group, &sgs);
                group = group->next;
        } while (group != sd->groups);

}

/**
 * fix_small_imbalance - Calculate the minor imbalance that exists
 *                      amongst the groups of a sched_domain, during
 *                      load balancing.
 * @sds: Statistics of the sched_domain whose imbalance is to be calculated.
 * @this_cpu: The cpu at whose sched_domain we're performing load-balance.
 * @imbalance: Variable to store the imbalance.
 */
static inline void fix_small_imbalance(struct sd_lb_stats *sds,
                                int this_cpu, unsigned long *imbalance)
{
        unsigned long tmp, pwr_now = 0, pwr_move = 0;
        unsigned int imbn = 2;

        if (sds->this_nr_running) {
                sds->this_load_per_task /= sds->this_nr_running;
                if (sds->busiest_load_per_task >
                                sds->this_load_per_task)
                        imbn = 1;
        } else
                sds->this_load_per_task =
                        cpu_avg_load_per_task(this_cpu);

        if (sds->max_load - sds->this_load + sds->busiest_load_per_task >=
                        sds->busiest_load_per_task * imbn) {
                *imbalance = sds->busiest_load_per_task;
                return;
        }

        /*
         * OK, we don't have enough imbalance to justify moving tasks,
         * however we may be able to increase total CPU power used by
         * moving them.
         */

        pwr_now += sds->busiest->__cpu_power *
                        min(sds->busiest_load_per_task, sds->max_load);
        pwr_now += sds->this->__cpu_power *
                        min(sds->this_load_per_task, sds->this_load);
        pwr_now /= SCHED_LOAD_SCALE;

        /* Amount of load we'd subtract */
        tmp = sg_div_cpu_power(sds->busiest,
                        sds->busiest_load_per_task * SCHED_LOAD_SCALE);
        if (sds->max_load > tmp)
                pwr_move += sds->busiest->__cpu_power *
                        min(sds->busiest_load_per_task, sds->max_load - tmp);

        /* Amount of load we'd add */
        if (sds->max_load * sds->busiest->__cpu_power <
                sds->busiest_load_per_task * SCHED_LOAD_SCALE)
                tmp = sg_div_cpu_power(sds->this,
                        sds->max_load * sds->busiest->__cpu_power);
        else
                tmp = sg_div_cpu_power(sds->this,
                        sds->busiest_load_per_task * SCHED_LOAD_SCALE);
        pwr_move += sds->this->__cpu_power *
                        min(sds->this_load_per_task, sds->this_load + tmp);
        pwr_move /= SCHED_LOAD_SCALE;

        /* Move if we gain throughput */
        if (pwr_move > pwr_now)
                *imbalance = sds->busiest_load_per_task;
}

/**
 * calculate_imbalance - Calculate the amount of imbalance present within the
 *                       groups of a given sched_domain during load balance.
 * @sds: statistics of the sched_domain whose imbalance is to be calculated.
 * @this_cpu: Cpu for which currently load balance is being performed.
 * @imbalance: The variable to store the imbalance.
 */
static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
                unsigned long *imbalance)
{
        unsigned long max_pull;
        /*
         * In the presence of smp nice balancing, certain scenarios can have
         * max load less than avg load(as we skip the groups at or below
         * its cpu_power, while calculating max_load..)
         */
        if (sds->max_load < sds->avg_load) {
                *imbalance = 0;
                return fix_small_imbalance(sds, this_cpu, imbalance);
        }

        /* Don't want to pull so many tasks that a group would go idle */
        max_pull = min(sds->max_load - sds->avg_load,
                        sds->max_load - sds->busiest_load_per_task);

        /* How much load to actually move to equalise the imbalance */
        *imbalance = min(max_pull * sds->busiest->__cpu_power,
                (sds->avg_load - sds->this_load) * sds->this->__cpu_power)
                        / SCHED_LOAD_SCALE;

        /*
         * if *imbalance is less than the average load per runnable task
         * there is no gaurantee that any tasks will be moved so we'll have
         * a think about bumping its value to force at least one task to be
         * moved
         */
        if (*imbalance < sds->busiest_load_per_task)
                return fix_small_imbalance(sds, this_cpu, imbalance);

}
/******* find_busiest_group() helpers end here *********************/

/**
 * find_busiest_group - Returns the busiest group within the sched_domain
 * if there is an imbalance. If there isn't an imbalance, and
 * the user has opted for power-savings, it returns a group whose
 * CPUs can be put to idle by rebalancing those tasks elsewhere, if
 * such a group exists.
 *
 * Also calculates the amount of weighted load which should be moved
 * to restore balance.
 *
 * @sd: The sched_domain whose busiest group is to be returned.
 * @this_cpu: The cpu for which load balancing is currently being performed.
 * @imbalance: Variable which stores amount of weighted load which should
 *              be moved to restore balance/put a group to idle.
 * @idle: The idle status of this_cpu.
 * @sd_idle: The idleness of sd
 * @cpus: The set of CPUs under consideration for load-balancing.
 * @balance: Pointer to a variable indicating if this_cpu
 *      is the appropriate cpu to perform load balancing at this_level.
 *
 * Returns:     - the busiest group if imbalance exists.
 *              - If no imbalance and user has opted for power-savings balance,
 *                 return the least loaded group whose CPUs can be
 *                 put to idle by rebalancing its tasks onto our group.
 */
static struct sched_group *
find_busiest_group(struct sched_domain *sd, int this_cpu,
                   unsigned long *imbalance, enum cpu_idle_type idle,
                   int *sd_idle, const struct cpumask *cpus, int *balance)
{
        struct sd_lb_stats sds;

        memset(&sds, 0, sizeof(sds));

        /*
         * Compute the various statistics relavent for load balancing at
         * this level.
         */
        update_sd_lb_stats(sd, this_cpu, idle, sd_idle, cpus,
                                        balance, &sds);

        /* Cases where imbalance does not exist from POV of this_cpu */
        /* 1) this_cpu is not the appropriate cpu to perform load balancing
         *    at this level.
         * 2) There is no busy sibling group to pull from.
         * 3) This group is the busiest group.
         * 4) This group is more busy than the avg busieness at this
         *    sched_domain.
         * 5) The imbalance is within the specified limit.
         * 6) Any rebalance would lead to ping-pong
         */
        if (balance && !(*balance))
                goto ret;

        if (!sds.busiest || sds.busiest_nr_running == 0)
                goto out_balanced;

        if (sds.this_load >= sds.max_load)
                goto out_balanced;

        sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr;

        if (sds.this_load >= sds.avg_load)
                goto out_balanced;

        if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
                goto out_balanced;

        sds.busiest_load_per_task /= sds.busiest_nr_running;
        if (sds.group_imb)
                sds.busiest_load_per_task =
                        min(sds.busiest_load_per_task, sds.avg_load);

        /*
         * We're trying to get all the cpus to the average_load, so we don't
         * want to push ourselves above the average load, nor do we wish to
         * reduce the max loaded cpu below the average load, as either of these
         * actions would just result in more rebalancing later, and ping-pong
         * tasks around. Thus we look for the minimum possible imbalance.
         * Negative imbalances (*we* are more loaded than anyone else) will
         * be counted as no imbalance for these purposes -- we can't fix that
         * by pulling tasks to us. Be careful of negative numbers as they'll
         * appear as very large values with unsigned longs.
         */
        if (sds.max_load <= sds.busiest_load_per_task)
                goto out_balanced;

        /* Looks like there is an imbalance. Compute it */
        calculate_imbalance(&sds, this_cpu, imbalance);
        return sds.busiest;

out_balanced:
        /*
         * There is no obvious imbalance. But check if we can do some balancing
         * to save power.
         */
        if (check_power_save_busiest_group(&sds, this_cpu, imbalance))
                return sds.busiest;
ret:
        *imbalance = 0;
        return NULL;
}