sched: optimize group load balancer

I noticed that tg_shares_up() unconditionally takes rq-locks for all cpus
in the sched_domain. This hurts.

We need the rq-locks whenever we change the weight of the per-cpu group sched
entities. To alleviate this a little, only change the weight when the new
weight is more than shares_thresh away from the old value.

This avoids the rq-lock for the top level entries, since those will never
be re-weighted, and fuzzes the lower level entries a little to gain performance
in semi-stable situations.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Author:    Peter Zijlstra, 2008-10-17 19:27:02 +02:00
Committer: Ingo Molnar
Parent:    b0aa51b999
Commit:    ffda12a17a
3 changed files with 36 additions and 20 deletions
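The diff below boils down to the gate described in the changelog: recompute the per-cpu
shares, clamp them, and only take the runqueue lock when the result has drifted more than
the threshold away from the weight that is already installed. The following stand-alone
userspace sketch illustrates that idea; toy_update_shares, clamp_ul and the TOY_* constants
are illustrative names and bounds, not kernel symbols.

/*
 * Illustrative sketch only -- toy names, not kernel code.
 */
#include <stdio.h>
#include <stdlib.h>

#define TOY_MIN_SHARES    2UL		/* toy bounds standing in for MIN_SHARES/MAX_SHARES */
#define TOY_MAX_SHARES    (1UL << 18)
#define TOY_SHARES_THRESH 4L		/* mirrors the new sysctl's default */

struct toy_entity {
	unsigned long weight;		/* weight currently installed for this cpu */
};

static unsigned long clamp_ul(unsigned long v, unsigned long lo, unsigned long hi)
{
	return v < lo ? lo : (v > hi ? hi : v);
}

/* Give this cpu its slice of sd_shares, but only touch it (and "lock") on a real change. */
static void toy_update_shares(struct toy_entity *se, unsigned long sd_shares,
			      unsigned long rq_weight, unsigned long sd_rq_weight)
{
	unsigned long shares = sd_shares * rq_weight / (sd_rq_weight + 1);
	long delta;

	shares = clamp_ul(shares, TOY_MIN_SHARES, TOY_MAX_SHARES);
	delta = (long)shares - (long)se->weight;
	if (labs(delta) <= TOY_SHARES_THRESH)
		return;			/* fuzz: skip the remote rq-lock entirely */

	/* spin_lock_irqsave(&rq->lock, ...) would go here */
	se->weight = shares;
	/* spin_unlock_irqrestore(...) */
}

int main(void)
{
	struct toy_entity se = { .weight = 512 };

	toy_update_shares(&se, 1024, 514, 1024);	/* computes 513, delta 1: skipped */
	toy_update_shares(&se, 1024, 600, 1024);	/* computes 599, delta 87: updated */
	printf("installed weight = %lu\n", se.weight);	/* 599 */
	return 0;
}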

include/linux/sched.h

@@ -1621,6 +1621,7 @@ extern unsigned int sysctl_sched_features;
 extern unsigned int sysctl_sched_migration_cost;
 extern unsigned int sysctl_sched_nr_migrate;
 extern unsigned int sysctl_sched_shares_ratelimit;
+extern unsigned int sysctl_sched_shares_thresh;
 
 int sched_nr_latency_handler(struct ctl_table *table, int write,
 		struct file *file, void __user *buffer, size_t *length,

kernel/sched.c

@@ -817,6 +817,13 @@ const_debug unsigned int sysctl_sched_nr_migrate = 32;
  */
 unsigned int sysctl_sched_shares_ratelimit = 250000;
 
+/*
+ * Inject some fuzzyness into changing the per-cpu group shares
+ * this avoids remote rq-locks at the expense of fairness.
+ * default: 4
+ */
+unsigned int sysctl_sched_shares_thresh = 4;
+
 /*
  * period over which we measure -rt task cpu usage in us.
  * default: 1s
@@ -1453,8 +1460,8 @@ static void __set_se_shares(struct sched_entity *se, unsigned long shares);
  * Calculate and set the cpu's group shares.
  */
 static void
-__update_group_shares_cpu(struct task_group *tg, int cpu,
-			  unsigned long sd_shares, unsigned long sd_rq_weight)
+update_group_shares_cpu(struct task_group *tg, int cpu,
+			unsigned long sd_shares, unsigned long sd_rq_weight)
 {
 	int boost = 0;
 	unsigned long shares;
@@ -1485,19 +1492,23 @@ __update_group_shares_cpu(struct task_group *tg, int cpu,
 	 *
 	 */
 	shares = (sd_shares * rq_weight) / (sd_rq_weight + 1);
+	shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES);
 
-	/*
-	 * record the actual number of shares, not the boosted amount.
-	 */
-	tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
-	tg->cfs_rq[cpu]->rq_weight = rq_weight;
+	if (abs(shares - tg->se[cpu]->load.weight) >
+			sysctl_sched_shares_thresh) {
+		struct rq *rq = cpu_rq(cpu);
+		unsigned long flags;
 
-	if (shares < MIN_SHARES)
-		shares = MIN_SHARES;
-	else if (shares > MAX_SHARES)
-		shares = MAX_SHARES;
+		spin_lock_irqsave(&rq->lock, flags);
+		/*
+		 * record the actual number of shares, not the boosted amount.
+		 */
+		tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
+		tg->cfs_rq[cpu]->rq_weight = rq_weight;
 
-	__set_se_shares(tg->se[cpu], shares);
+		__set_se_shares(tg->se[cpu], shares);
+		spin_unlock_irqrestore(&rq->lock, flags);
+	}
 }
 
 /*
@@ -1526,14 +1537,8 @@ static int tg_shares_up(struct task_group *tg, void *data)
 	if (!rq_weight)
 		rq_weight = cpus_weight(sd->span) * NICE_0_LOAD;
 
-	for_each_cpu_mask(i, sd->span) {
-		struct rq *rq = cpu_rq(i);
-		unsigned long flags;
-
-		spin_lock_irqsave(&rq->lock, flags);
-		__update_group_shares_cpu(tg, i, shares, rq_weight);
-		spin_unlock_irqrestore(&rq->lock, flags);
-	}
+	for_each_cpu_mask(i, sd->span)
+		update_group_shares_cpu(tg, i, shares, rq_weight);
 
 	return 0;
 }
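To see how much locking the threshold saves in the "semi-stable situations" the changelog
mentions, here is a small hedged userspace trace with illustrative values only: with the
default threshold of 4, jitter around an installed weight of 1024 never takes the lock,
and only genuine shifts do.

/*
 * Hedged illustration (not kernel code): how often the default threshold
 * of 4 lets update_group_shares_cpu() skip the rq-lock when the computed
 * shares only jitter around the installed weight.
 */
#include <stdio.h>
#include <stdlib.h>

#define TOY_SHARES_THRESH 4L

int main(void)
{
	long installed = 1024;		/* currently installed entity weight */
	long locks_taken = 0;
	long samples[] = { 1025, 1023, 1026, 1022, 1030, 1029, 1031, 1040 };
	size_t i;

	for (i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
		if (labs(samples[i] - installed) > TOY_SHARES_THRESH) {
			/* rq->lock would be taken only for this update */
			installed = samples[i];
			locks_taken++;
		}
	}

	/* Only the moves to 1030 and 1040 exceed the threshold: 2 lock
	 * acquisitions instead of the 8 unconditional ones taken before
	 * this patch. */
	printf("locks taken: %ld, final weight: %ld\n", locks_taken, installed);
	return 0;
}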

kernel/sysctl.c

@@ -274,6 +274,16 @@ static struct ctl_table kern_table[] = {
 		.mode = 0644,
 		.proc_handler = &proc_dointvec,
 	},
+	{
+		.ctl_name = CTL_UNNUMBERED,
+		.procname = "sched_shares_thresh",
+		.data = &sysctl_sched_shares_thresh,
+		.maxlen = sizeof(unsigned int),
+		.mode = 0644,
+		.proc_handler = &proc_dointvec_minmax,
+		.strategy = &sysctl_intvec,
+		.extra1 = &zero,
+	},
 	{
 		.ctl_name = CTL_UNNUMBERED,
 		.procname = "sched_child_runs_first",
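The new kern_table entry makes the threshold tunable at runtime; entries in this table are
exposed under /proc/sys/kernel/, so the knob should appear as
/proc/sys/kernel/sched_shares_thresh on a kernel carrying this patch. A minimal userspace
sketch for inspecting and raising it (the path is assumed as above; writing requires root):

/* Minimal sketch: read and raise the sched_shares_thresh sysctl (assumed
 * to live at /proc/sys/kernel/sched_shares_thresh on a patched kernel). */
#include <stdio.h>

int main(void)
{
	const char *path = "/proc/sys/kernel/sched_shares_thresh";
	unsigned int thresh = 0;
	FILE *f = fopen(path, "r");

	if (!f)
		return 1;	/* kernel without this patch (or no procfs) */
	if (fscanf(f, "%u", &thresh) == 1)
		printf("current threshold: %u\n", thresh);
	fclose(f);

	/* More fuzz: fewer remote rq-locks, at some cost in fairness precision. */
	f = fopen(path, "w");
	if (f) {
		fprintf(f, "%u\n", 8u);
		fclose(f);
	}
	return 0;
}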