From 7def2be1dc679984f4c4fb3ef19a8a081b2454ec Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 5 Jun 2008 14:49:58 +0200 Subject: [PATCH] sched: fix hotplug cpus on ia64 Cliff Wickman wrote: > I built an ia64 kernel from Andrew's tree (2.6.26-rc2-mm1) > and get a very predictable hotplug cpu problem. > billberry1:/tmp/cpw # ./dis > disabled cpu 17 > enabled cpu 17 > billberry1:/tmp/cpw # ./dis > disabled cpu 17 > enabled cpu 17 > billberry1:/tmp/cpw # ./dis > > The script that disables the cpu always hangs (unkillable) > on the 3rd attempt. > > And a bit further: > The kstopmachine thread always sits on the run queue (real time) for about > 30 minutes before running. this fix solves some (but not all) issues between CPU hotplug and RT bandwidth throttling. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 15 +++++-- kernel/sched_rt.c | 109 +++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 115 insertions(+), 9 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index 727bdef7616..e9c24a12865 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7513,21 +7513,28 @@ int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls) static int update_sched_domains(struct notifier_block *nfb, unsigned long action, void *hcpu) { + int cpu = (int)(long)hcpu; + switch (action) { - case CPU_UP_PREPARE: - case CPU_UP_PREPARE_FROZEN: case CPU_DOWN_PREPARE: case CPU_DOWN_PREPARE_FROZEN: + disable_runtime(cpu_rq(cpu)); + /* fall-through */ + case CPU_UP_PREPARE: + case CPU_UP_PREPARE_FROZEN: detach_destroy_domains(&cpu_online_map); free_sched_domains(); return NOTIFY_OK; - case CPU_UP_CANCELED: - case CPU_UP_CANCELED_FROZEN: + case CPU_DOWN_FAILED: case CPU_DOWN_FAILED_FROZEN: case CPU_ONLINE: case CPU_ONLINE_FROZEN: + enable_runtime(cpu_rq(cpu)); + /* fall-through */ + case CPU_UP_CANCELED: + case CPU_UP_CANCELED_FROZEN: case CPU_DEAD: case CPU_DEAD_FROZEN: /* diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index eaa606071d5..8ae3416e0bb 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -286,6 +286,9 @@ static int balance_runtime(struct rt_rq *rt_rq) continue; spin_lock(&iter->rt_runtime_lock); + if (iter->rt_runtime == RUNTIME_INF) + goto next; + diff = iter->rt_runtime - iter->rt_time; if (diff > 0) { do_div(diff, weight); @@ -299,12 +302,105 @@ static int balance_runtime(struct rt_rq *rt_rq) break; } } +next: spin_unlock(&iter->rt_runtime_lock); } spin_unlock(&rt_b->rt_runtime_lock); return more; } + +static void __disable_runtime(struct rq *rq) +{ + struct root_domain *rd = rq->rd; + struct rt_rq *rt_rq; + + if (unlikely(!scheduler_running)) + return; + + for_each_leaf_rt_rq(rt_rq, rq) { + struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); + s64 want; + int i; + + spin_lock(&rt_b->rt_runtime_lock); + spin_lock(&rt_rq->rt_runtime_lock); + if (rt_rq->rt_runtime == RUNTIME_INF || + rt_rq->rt_runtime == rt_b->rt_runtime) + goto balanced; + spin_unlock(&rt_rq->rt_runtime_lock); + + want = rt_b->rt_runtime - rt_rq->rt_runtime; + + for_each_cpu_mask(i, rd->span) { + struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i); + s64 diff; + + if (iter == rt_rq) + continue; + + spin_lock(&iter->rt_runtime_lock); + if (want > 0) { + diff = min_t(s64, iter->rt_runtime, want); + iter->rt_runtime -= diff; + want -= diff; + } else { + iter->rt_runtime -= want; + want -= want; + } + spin_unlock(&iter->rt_runtime_lock); + + if (!want) + break; + } + + spin_lock(&rt_rq->rt_runtime_lock); + BUG_ON(want); +balanced: + rt_rq->rt_runtime = RUNTIME_INF; + spin_unlock(&rt_rq->rt_runtime_lock); + spin_unlock(&rt_b->rt_runtime_lock); + } +} + +static void disable_runtime(struct rq *rq) +{ + unsigned long flags; + + spin_lock_irqsave(&rq->lock, flags); + __disable_runtime(rq); + spin_unlock_irqrestore(&rq->lock, flags); +} + +static void __enable_runtime(struct rq *rq) +{ + struct root_domain *rd = rq->rd; + struct rt_rq *rt_rq; + + if (unlikely(!scheduler_running)) + return; + + for_each_leaf_rt_rq(rt_rq, rq) { + struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); + + spin_lock(&rt_b->rt_runtime_lock); + spin_lock(&rt_rq->rt_runtime_lock); + rt_rq->rt_runtime = rt_b->rt_runtime; + rt_rq->rt_time = 0; + spin_unlock(&rt_rq->rt_runtime_lock); + spin_unlock(&rt_b->rt_runtime_lock); + } +} + +static void enable_runtime(struct rq *rq) +{ + unsigned long flags; + + spin_lock_irqsave(&rq->lock, flags); + __enable_runtime(rq); + spin_unlock_irqrestore(&rq->lock, flags); +} + #endif static inline int rt_se_prio(struct sched_rt_entity *rt_se) @@ -334,14 +430,13 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq) #ifdef CONFIG_SMP if (rt_rq->rt_time > runtime) { - int more; - spin_unlock(&rt_rq->rt_runtime_lock); - more = balance_runtime(rt_rq); + balance_runtime(rt_rq); spin_lock(&rt_rq->rt_runtime_lock); - if (more) - runtime = sched_rt_runtime(rt_rq); + runtime = sched_rt_runtime(rt_rq); + if (runtime == RUNTIME_INF) + return 0; } #endif @@ -1174,6 +1269,8 @@ static void rq_online_rt(struct rq *rq) if (rq->rt.overloaded) rt_set_overload(rq); + __enable_runtime(rq); + cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio); } @@ -1183,6 +1280,8 @@ static void rq_offline_rt(struct rq *rq) if (rq->rt.overloaded) rt_clear_overload(rq); + __disable_runtime(rq); + cpupri_set(&rq->rd->cpupri, rq->cpu, CPUPRI_INVALID); }