sched: Code commentary for the load balancer code. From: Gautham R Shenoy Add comments in the load balancer code to make it a bit easier to understand for a newbie. Signed-off-by: Gautham R Shenoy --- kernel/sched.c | 211 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 files changed, 202 insertions(+), 9 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index cfa222a..e167e06 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3255,16 +3255,48 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, unsigned long *imbalance, enum cpu_idle_type idle, int *sd_idle, const cpumask_t *cpus, int *balance) { + /* + * |busiest|: is the sched_group to be returned. + * |this|: is the sched_group that contains this_cpu. + */ struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; + /* |max_load|: is the load in the busiest group relative to + * its __cpu_power + * |avg_load|: is used differently in different parts of the code. + * |total_load| is the sum of loads across all the groups of this sd. + * |this_load| is the load on |this| sched_group. + * |total_pwr| is the sum of __cpu_power of all the groups in the sd. + */ unsigned long max_load, avg_load, total_load, this_load, total_pwr; unsigned long max_pull; + /* + * |busiest_load_per_task| is load_per_task in the busiest group. + * |busiest_nr_running| is the number of running tasks in the busiest + * grp. + */ unsigned long busiest_load_per_task, busiest_nr_running; unsigned long this_load_per_task, this_nr_running; int load_idx, group_imb = 0; #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) int power_savings_balance = 1; + /* + * |leader_nr_running|: is the number of tasks running on the + * |group_leader| (see below) group. + * + * |min_load_per_task|: The load_per_task within the |group_min| + * (see below) group. + * + * |min_nr_running|: number of tasks running in group_min group. 
+ */ unsigned long leader_nr_running = 0, min_load_per_task = 0; unsigned long min_nr_running = ULONG_MAX; + /* + * |group_min|: The group with the least amount of non-idle load + * which can be migrated to make this group idle. + * |group_leader|: The group which is near its full capacity, + * but can accommodate a few more tasks so that it can make + * some other group idle. + */ struct sched_group *group_min = NULL, *group_leader = NULL; #endif @@ -3273,17 +3305,56 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, this_load_per_task = this_nr_running = 0; if (idle == CPU_NOT_IDLE) load_idx = sd->busy_idx; + /* + * When a cpu becomes newly idle, the scheduler tries to + * see if it can pull in some tasks from its siblings. + * We use the sd->newidle_idx in such a case. + */ else if (idle == CPU_NEWLY_IDLE) load_idx = sd->newidle_idx; else load_idx = sd->idle_idx; + /* + * Here we enter the big loop where we iterate over the various + * sched groups in the sd domain. This loop computes the group's + * characteristics. + */ do { + /* + * |group_capacity|: The number of tasks corresponding to the + * group's __cpu_power. + * |max_cpu_load|: is the load on the most heavily loaded cpu + * within the group. + * |min_cpu_load|: is the load on the least loaded cpu + * within the group. + */ unsigned long load, group_capacity, max_cpu_load, min_cpu_load; + /* + * |local_group|: this is a boolean to check if current + * group contains |this_cpu| + */ int local_group; int i; int __group_imb = 0; + /* + * |balance_cpu| is the cpu in the local group + * which is going to do the load balancing. + * + * |balance_cpu| is the first idle cpu in the group or if + * there are no idle cpus, then it's the first cpu in + * the group. + * + * |first_idle_cpu| is a boolean which + * tells us whether we've found at least + * one idle cpu in the local group. 
+ */ unsigned int balance_cpu = -1, first_idle_cpu = 0; + /* + * sum_nr_running: total number of runnable tasks in the group. + * sum_weighted_load is the sum of the cpu weighted load + * of the cpus in the group. + */ unsigned long sum_nr_running, sum_weighted_load; local_group = cpu_isset(this_cpu, group->cpumask); @@ -3291,7 +3362,15 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, if (local_group) balance_cpu = first_cpu(group->cpumask); - /* Tally up the load of all CPUs in the group */ + /* + * Tally up the load of all CPUs in the group. + * Within this loop, + * sum_weighted_load = the sum of weighted_load + * of all the cpus in the group. + * sum_nr_running = Total number of tasks that are running + * on the group's cpus. + * avg_load = Sum(load of all group's cpus) (not yet an average) + */ sum_weighted_load = sum_nr_running = avg_load = 0; max_cpu_load = 0; min_cpu_load = ~0UL; @@ -3304,16 +3383,31 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, rq = cpu_rq(i); + /* + * If we originally assumed that the domain is + * idle, but we have some group which still has + * some running tasks, take back our words about + * being idle. + */ if (*sd_idle && rq->nr_running) *sd_idle = 0; - /* Bias balancing toward cpus of our domain */ + /* + * Bias balancing toward cpus of our domain. + * + * If it's a local group, then we use target_load() + * which gives the higher estimate of the load on the + * cpu i. That's because we're looking at pulling tasks + * into this group. + * + * Otherwise, we use source_load(), since we're planning + * to pull from these groups. + */ if (local_group) { if (idle_cpu(i) && !first_idle_cpu) { first_idle_cpu = 1; balance_cpu = i; } - load = target_load(i, load_idx); } else { load = source_load(i, load_idx); @@ -3323,6 +3417,10 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, min_cpu_load = load; } + /* + * |avg_load| within this inner loop holds the + * sum of loads on all the cpus of the group. 
+ */ avg_load += load; sum_nr_running += rq->nr_running; sum_weighted_load += weighted_cpuload(i); @@ -3333,6 +3431,12 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, * is eligible for doing load balancing at this and above * domains. In the newly idle case, we will allow all the cpu's * to do the newly idle load balance. + * + * IOW, if we're not the first idle cpu, or we're not + * the first cpu when the whole group is busy, we don't do + * load balancing. However, if we're newly idle, we + * have to see if, before going idle, we can pull some tasks + * (hopefully) from our siblings and go back to busy mode. */ if (idle != CPU_NEWLY_IDLE && local_group && balance_cpu != this_cpu && balance) { @@ -3340,16 +3444,49 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, goto ret; } + /* + * The __cpu_power has nothing to do with the energy + * power. It represents the maximum amount of load + * a sched group can handle, when there are other + * idle sched groups within the same domain. + * It's always a multiple of SCHED_LOAD_SCALE. For + * smt and multicore cases, it's set to SCHED_LOAD_SCALE, + * which means that a cpu can have at most 1 task when + * the others are idle. + * + * Is this something to ponder over for doing less loaded + * consolidation? + * + */ total_load += avg_load; total_pwr += group->__cpu_power; - /* Adjust by relative CPU power of the group */ + /* Adjust by relative CPU power of the group. + * + * |avg_load| here is the average load, with respect + * to the group's capacity. + * + * That would be (sum of load on all cpus)/group_capacity. + * And here, it's + * + * = avg_load/(group->__cpu_power/SCHED_LOAD_SCALE); + * = (avg_load * SCHED_LOAD_SCALE) / group->__cpu_power; + */ avg_load = sg_div_cpu_power(group, avg_load * SCHED_LOAD_SCALE); + /* + * If not local group, see if we're having an imbalance + * within the cpus of the group. 
+ */ if ((max_cpu_load - min_cpu_load) > SCHED_LOAD_SCALE) __group_imb = 1; + /* + * Maximum number of tasks which a group can handle + * without feeling the pinch when the other sibling groups + * are idle. + */ group_capacity = group->__cpu_power / SCHED_LOAD_SCALE; if (local_group) { @@ -3357,7 +3494,14 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, this = group; this_nr_running = sum_nr_running; this_load_per_task = sum_weighted_load; - } else if (avg_load > max_load && + } + /* + * For a group to qualify as busiest, not only should the + * avg_load be beyond the max_load encountered, but it + * should be running more tasks than it can handle, or + * should have an internal imbalance. + */ + else if (avg_load > max_load && (sum_nr_running > group_capacity || __group_imb)) { max_load = avg_load; busiest = group; @@ -3367,9 +3511,18 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, } #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) +/* + * Definition: Power Savings Balance: We try to load balance such that + * the tasks run on a fewer number of groups within a sched_domain + * by which we can keep some of the groups completely idle. + */ + /* * Busy processors will not participate in power savings - * balance. + * balance because, if we have come to this point, then either + * a) |this_cpu| is the first idle cpu in the local_group + * b) |this_cpu| is the busiest in this group. + * If (b), it doesn't make sense for us to pull any more tasks. */ if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE)) @@ -3377,7 +3530,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, /* * If the local group is idle or completely loaded - * no need to do power savings balance at this domain + * we don't need to do power savings balance at this domain + * from the local group's perspective. 
*/ if (local_group && (this_nr_running >= group_capacity || !this_nr_running)) @@ -3425,6 +3579,10 @@ group_next: group = group->next; } while (group != sd->groups); + /* + * Question to ponder over: When busiest is non-null, can + * busiest_nr_running be 0? + */ if (!busiest || this_load >= max_load || busiest_nr_running == 0) goto out_balanced; @@ -3435,6 +3593,13 @@ group_next: goto out_balanced; busiest_load_per_task /= busiest_nr_running; + + /* + * If we called a group the busiest because it was having + * internal imbalance, then |busiest_load_per_task| should + * be considered from a minimum of the whole group's avg_load + * and the load_per_task for that group. + */ if (group_imb) busiest_load_per_task = min(busiest_load_per_task, avg_load); @@ -3462,10 +3627,24 @@ group_next: goto small_imbalance; } - /* Don't want to pull so many tasks that a group would go idle */ + /* + * Don't want to pull so many tasks that a group would go idle. + * We're trying to pull from the busiest group. How much + * we can pull is determined by the minimum of + * a) How far away the busiest group is from the + * sched_domain's avg_load (max_load - avg_load) + * b) How much extra load should be migrated away from it so that + * it remains within its capacity. + */ max_pull = min(max_load - avg_load, max_load - busiest_load_per_task); - /* How much load to actually move to equalise the imbalance */ + /* + * How much load to actually move to equalise the imbalance. + * This is determined by the minimum of + * a) How much we can pull from the busiest group. + * b) How much we can accommodate so that we don't go beyond the + * avg_load + */ *imbalance = min(max_pull * busiest->__cpu_power, (avg_load - this_load) * this->__cpu_power) / SCHED_LOAD_SCALE; @@ -4019,7 +4198,21 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) if (!(sd->flags & SD_LOAD_BALANCE)) continue; + /* + * This is the interval which says how often we should + * rebalance. 
It changes depending on how busy or idle + * the cpu has been. + */ interval = sd->balance_interval; + + /* If the cpu wasn't idle, multiply by the appropriate + * busy factor. This will increase the length of the + * interval over which we check if lb is required. + * The greater the interval, the lesser the load you pull. + * + * Question: can we change this busy_factor at runtime? + * There appears to be a sysctl table entry. + */ if (idle != CPU_IDLE) interval *= sd->busy_factor;