From ac086bc22997a2be24fc40fc8d46522fe7e03d11 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 19 Apr 2008 19:44:58 +0200 Subject: [PATCH] sched: rt-group: smp balancing Currently the rt group scheduling does a per cpu runtime limit, however the rt load balancer makes no guarantees about an equal spread of real- time tasks, just that at any one time, the highest priority tasks run. Solve this by making the runtime limit a global property by borrowing excessive runtime from the other cpus once the local limit runs out. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 40 +++++++++++++++++++-- kernel/sched_rt.c | 88 +++++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 122 insertions(+), 6 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index bb20323f7d0..313cd4f057c 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -164,6 +164,7 @@ struct rt_prio_array { struct rt_bandwidth { ktime_t rt_period; u64 rt_runtime; + spinlock_t rt_runtime_lock; struct hrtimer rt_period_timer; }; @@ -198,6 +199,8 @@ void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime) rt_b->rt_period = ns_to_ktime(period); rt_b->rt_runtime = runtime; + spin_lock_init(&rt_b->rt_runtime_lock); + hrtimer_init(&rt_b->rt_period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); rt_b->rt_period_timer.function = sched_rt_period_timer; @@ -414,6 +417,8 @@ struct rt_rq { #endif int rt_throttled; u64 rt_time; + u64 rt_runtime; + spinlock_t rt_runtime_lock; #ifdef CONFIG_RT_GROUP_SCHED unsigned long rt_nr_boosted; @@ -7299,6 +7304,8 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) rt_rq->rt_time = 0; rt_rq->rt_throttled = 0; + rt_rq->rt_runtime = 0; + spin_lock_init(&rt_rq->rt_runtime_lock); #ifdef CONFIG_RT_GROUP_SCHED rt_rq->rt_nr_boosted = 0; @@ -7335,6 +7342,7 @@ static void init_tg_rt_entry(struct rq *rq, struct task_group *tg, init_rt_rq(rt_rq, rq); rt_rq->tg = tg; rt_rq->rt_se = rt_se; + rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime; if (add) list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list); @@ -7391,6 +7399,8 @@ void __init sched_init(void) init_tg_rt_entry(rq, &init_task_group, &per_cpu(init_rt_rq, i), &per_cpu(init_sched_rt_entity, i), i, 1); +#else + rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime; #endif for (j = 0; j < CPU_LOAD_IDX_MAX; j++) @@ -7974,11 +7984,11 @@ static inline int tg_has_rt_tasks(struct task_group *tg) static int tg_set_bandwidth(struct task_group *tg, u64 rt_period, u64 rt_runtime) { - int err = 0; + int i, err = 0; mutex_lock(&rt_constraints_mutex); read_lock(&tasklist_lock); - if (rt_runtime_us == 0 && tg_has_rt_tasks(tg)) { + if (rt_runtime == 0 && tg_has_rt_tasks(tg)) { err = -EBUSY; goto unlock; } @@ -7986,8 +7996,19 @@ static int tg_set_bandwidth(struct task_group *tg, err = -EINVAL; goto unlock; } + + spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock); tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period); tg->rt_bandwidth.rt_runtime = rt_runtime; + + for_each_possible_cpu(i) { + struct rt_rq *rt_rq = tg->rt_rq[i]; + + spin_lock(&rt_rq->rt_runtime_lock); + rt_rq->rt_runtime = rt_runtime; + spin_unlock(&rt_rq->rt_runtime_lock); + } + spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock); unlock: read_unlock(&tasklist_lock); mutex_unlock(&rt_constraints_mutex); @@ -8052,6 +8073,19 @@ static int sched_rt_global_constraints(void) #else static int sched_rt_global_constraints(void) { + unsigned long flags; + int i; + + spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); + for_each_possible_cpu(i) { + struct rt_rq *rt_rq = &cpu_rq(i)->rt; + + spin_lock(&rt_rq->rt_runtime_lock); + rt_rq->rt_runtime = global_rt_runtime(); + spin_unlock(&rt_rq->rt_runtime_lock); + } + spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags); + return 0; } #endif @@ -8168,7 +8202,7 @@ static u64 cpu_shares_read_uint(struct cgroup *cgrp, struct cftype *cft) #endif #ifdef CONFIG_RT_GROUP_SCHED -static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft, +static ssize_t cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft, struct file *file, const char __user *userbuf, size_t nbytes, loff_t *unused_ppos) diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 8bc17613666..6928ded24da 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -62,7 +62,12 @@ static inline u64 sched_rt_runtime(struct rt_rq *rt_rq) if (!rt_rq->tg) return RUNTIME_INF; - return rt_rq->tg->rt_bandwidth.rt_runtime; + return rt_rq->rt_runtime; +} + +static inline u64 sched_rt_period(struct rt_rq *rt_rq) +{ + return ktime_to_ns(rt_rq->tg->rt_bandwidth.rt_period); } #define for_each_leaf_rt_rq(rt_rq, rq) \ @@ -145,11 +150,21 @@ struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu) return container_of(rt_b, struct task_group, rt_bandwidth)->rt_rq[cpu]; } +static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq) +{ + return &rt_rq->tg->rt_bandwidth; +} + #else static inline u64 sched_rt_runtime(struct rt_rq *rt_rq) { - return def_rt_bandwidth.rt_runtime; + return rt_rq->rt_runtime; +} + +static inline u64 sched_rt_period(struct rt_rq *rt_rq) +{ + return ktime_to_ns(def_rt_bandwidth.rt_period); } #define for_each_leaf_rt_rq(rt_rq, rq) \ @@ -200,6 +215,11 @@ struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu) return &cpu_rq(cpu)->rt; } +static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq) +{ + return &def_rt_bandwidth; +} + #endif static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) @@ -218,8 +238,10 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) spin_lock(&rq->lock); if (rt_rq->rt_time) { - u64 runtime = rt_b->rt_runtime; + u64 runtime; + spin_lock(&rt_rq->rt_runtime_lock); + runtime = rt_rq->rt_runtime; rt_rq->rt_time -= min(rt_rq->rt_time, overrun*runtime); if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) { rt_rq->rt_throttled = 0; @@ -227,6 +249,7 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) } if (rt_rq->rt_time || rt_rq->rt_nr_running) idle = 0; + spin_unlock(&rt_rq->rt_runtime_lock); } if (enqueue) @@ -237,6 +260,47 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) return idle; } +#ifdef CONFIG_SMP +static int balance_runtime(struct rt_rq *rt_rq) +{ + struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); + struct root_domain *rd = cpu_rq(smp_processor_id())->rd; + int i, weight, more = 0; + u64 rt_period; + + weight = cpus_weight(rd->span); + + spin_lock(&rt_b->rt_runtime_lock); + rt_period = ktime_to_ns(rt_b->rt_period); + for_each_cpu_mask(i, rd->span) { + struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i); + s64 diff; + + if (iter == rt_rq) + continue; + + spin_lock(&iter->rt_runtime_lock); + diff = iter->rt_runtime - iter->rt_time; + if (diff > 0) { + do_div(diff, weight); + if (rt_rq->rt_runtime + diff > rt_period) + diff = rt_period - rt_rq->rt_runtime; + iter->rt_runtime -= diff; + rt_rq->rt_runtime += diff; + more = 1; + if (rt_rq->rt_runtime == rt_period) { + spin_unlock(&iter->rt_runtime_lock); + break; + } + } + spin_unlock(&iter->rt_runtime_lock); + } + spin_unlock(&rt_b->rt_runtime_lock); + + return more; +} +#endif + static inline int rt_se_prio(struct sched_rt_entity *rt_se) { #ifdef CONFIG_RT_GROUP_SCHED @@ -259,6 +323,22 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq) if (rt_rq->rt_throttled) return rt_rq_throttled(rt_rq); + if (sched_rt_runtime(rt_rq) >= sched_rt_period(rt_rq)) + return 0; + +#ifdef CONFIG_SMP + if (rt_rq->rt_time > runtime) { + int more; + + spin_unlock(&rt_rq->rt_runtime_lock); + more = balance_runtime(rt_rq); + spin_lock(&rt_rq->rt_runtime_lock); + + if (more) + runtime = sched_rt_runtime(rt_rq); + } +#endif + if (rt_rq->rt_time > runtime) { rt_rq->rt_throttled = 1; if (rt_rq_throttled(rt_rq)) { @@ -294,9 +374,11 @@ static void update_curr_rt(struct rq *rq) curr->se.exec_start = rq->clock; cpuacct_charge(curr, delta_exec); + spin_lock(&rt_rq->rt_runtime_lock); rt_rq->rt_time += delta_exec; if (sched_rt_runtime_exceeded(rt_rq)) resched_task(curr); + spin_unlock(&rt_rq->rt_runtime_lock); } static inline