From ca94c442535a44d508c99a77e54f21a59f4fc462 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Mon, 15 Jun 2009 17:17:47 +0200 Subject: [PATCH 01/64] sched: Introduce SCHED_RESET_ON_FORK scheduling policy flag This patch introduces a new flag SCHED_RESET_ON_FORK which can be passed to the kernel via sched_setscheduler(), ORed in the policy parameter. If set this will make sure that when the process forks a) the scheduling priority is reset to DEFAULT_PRIO if it was higher and b) the scheduling policy is reset to SCHED_NORMAL if it was either SCHED_FIFO or SCHED_RR. Why have this? Currently, if a process is real-time scheduled this will 'leak' to all its child processes. For security reasons it is often (always?) a good idea to make sure that if a process acquires RT scheduling this is confined to this process and only this process. More specifically this makes the per-process resource limit RLIMIT_RTTIME useful for security purposes, because it makes it impossible to use a fork bomb to circumvent the per-process RLIMIT_RTTIME accounting. This feature is also useful for tools like 'renice' which can then change the nice level of a process without having this spill to all its child processes. Why expose this via sched_setscheduler() and not other syscalls such as prctl() or sched_setparam()? prctl() does not take a pid parameter. Due to that it would be impossible to modify this flag for other processes than the current one. The struct passed to sched_setparam() can unfortunately not be extended without breaking compatibility, since sched_setparam() lacks a size parameter. How to use this from userspace? In your RT program simply replace this: sched_setscheduler(pid, SCHED_FIFO, ¶m); by this: sched_setscheduler(pid, SCHED_FIFO|SCHED_RESET_ON_FORK, ¶m); Signed-off-by: Lennart Poettering Acked-by: Peter Zijlstra LKML-Reference: <20090615152714.GA29092@tango.0pointer.de> Signed-off-by: Ingo Molnar --- include/linux/sched.h | 6 ++++++ kernel/sched.c | 49 +++++++++++++++++++++++++++++++++++-------- 2 files changed, 46 insertions(+), 9 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 4896fdfec91..d4a2c6662f7 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -38,6 +38,8 @@ #define SCHED_BATCH 3 /* SCHED_ISO: reserved but not implemented yet */ #define SCHED_IDLE 5 +/* Can be ORed in to make sure the process is reverted back to SCHED_NORMAL on fork */ +#define SCHED_RESET_ON_FORK 0x40000000 #ifdef __KERNEL__ @@ -1209,6 +1211,10 @@ struct task_struct { unsigned did_exec:1; unsigned in_execve:1; /* Tell the LSMs that the process is doing an * execve */ + + /* Revert to default priority/policy when forking */ + unsigned sched_reset_on_fork:1; + pid_t pid; pid_t tgid; diff --git a/kernel/sched.c b/kernel/sched.c index 8ec9d13140b..32e6ede8525 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2613,12 +2613,28 @@ void sched_fork(struct task_struct *p, int clone_flags) set_task_cpu(p, cpu); /* - * Make sure we do not leak PI boosting priority to the child: + * Revert to default priority/policy on fork if requested. Make sure we + * do not leak PI boosting priority to the child. */ - p->prio = current->normal_prio; + if (current->sched_reset_on_fork && + (p->policy == SCHED_FIFO || p->policy == SCHED_RR)) + p->policy = SCHED_NORMAL; + + if (current->sched_reset_on_fork && + (current->normal_prio < DEFAULT_PRIO)) + p->prio = DEFAULT_PRIO; + else + p->prio = current->normal_prio; + if (!rt_prio(p->prio)) p->sched_class = &fair_sched_class; + /* + * We don't need the reset flag anymore after the fork. It has + * fulfilled its duty: + */ + p->sched_reset_on_fork = 0; + #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) if (likely(sched_info_on())) memset(&p->sched_info, 0, sizeof(p->sched_info)); @@ -6094,17 +6110,25 @@ static int __sched_setscheduler(struct task_struct *p, int policy, unsigned long flags; const struct sched_class *prev_class = p->sched_class; struct rq *rq; + int reset_on_fork; /* may grab non-irq protected spin_locks */ BUG_ON(in_interrupt()); recheck: /* double check policy once rq lock held */ - if (policy < 0) + if (policy < 0) { + reset_on_fork = p->sched_reset_on_fork; policy = oldpolicy = p->policy; - else if (policy != SCHED_FIFO && policy != SCHED_RR && - policy != SCHED_NORMAL && policy != SCHED_BATCH && - policy != SCHED_IDLE) - return -EINVAL; + } else { + reset_on_fork = !!(policy & SCHED_RESET_ON_FORK); + policy &= ~SCHED_RESET_ON_FORK; + + if (policy != SCHED_FIFO && policy != SCHED_RR && + policy != SCHED_NORMAL && policy != SCHED_BATCH && + policy != SCHED_IDLE) + return -EINVAL; + } + /* * Valid priorities for SCHED_FIFO and SCHED_RR are * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL, @@ -6148,6 +6172,10 @@ recheck: /* can't change other user's priorities */ if (!check_same_owner(p)) return -EPERM; + + /* Normal users shall not reset the sched_reset_on_fork flag */ + if (p->sched_reset_on_fork && !reset_on_fork) + return -EPERM; } if (user) { @@ -6191,6 +6219,8 @@ recheck: if (running) p->sched_class->put_prev_task(rq, p); + p->sched_reset_on_fork = reset_on_fork; + oldprio = p->prio; __setscheduler(rq, p, policy, param->sched_priority); @@ -6307,14 +6337,15 @@ SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid) if (p) { retval = security_task_getscheduler(p); if (!retval) - retval = p->policy; + retval = p->policy + | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0); } read_unlock(&tasklist_lock); return retval; } /** - * sys_sched_getscheduler - get the RT priority of a thread + * sys_sched_getparam - get the RT priority of a thread * @pid: the pid in question. * @param: structure containing the RT priority. */ From b9dc29e72fd3dc2a739ce8eafd958220d0745734 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Wed, 17 Jun 2009 10:46:01 +0200 Subject: [PATCH 02/64] sched: Clean up SCHED_RESET_ON_FORK Make SCHED_RESET_ON_FORK sched_fork() bits a self-contained unlikely code path. Signed-off-by: Mike Galbraith Acked-by: Lennart Poettering Cc: Peter Zijlstra LKML-Reference: <1245228361.18329.6.camel@marge.simson.net> Signed-off-by: Ingo Molnar --- kernel/sched.c | 34 ++++++++++++++++++---------------- 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index 32e6ede8525..50e4e3d15e8 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2613,28 +2613,30 @@ void sched_fork(struct task_struct *p, int clone_flags) set_task_cpu(p, cpu); /* - * Revert to default priority/policy on fork if requested. Make sure we - * do not leak PI boosting priority to the child. + * Make sure we do not leak PI boosting priority to the child. */ - if (current->sched_reset_on_fork && - (p->policy == SCHED_FIFO || p->policy == SCHED_RR)) - p->policy = SCHED_NORMAL; + p->prio = current->normal_prio; - if (current->sched_reset_on_fork && - (current->normal_prio < DEFAULT_PRIO)) - p->prio = DEFAULT_PRIO; - else - p->prio = current->normal_prio; + /* + * Revert to default priority/policy on fork if requested. + */ + if (unlikely(p->sched_reset_on_fork)) { + if (p->policy == SCHED_FIFO || p->policy == SCHED_RR) + p->policy = SCHED_NORMAL; + + if (p->normal_prio < DEFAULT_PRIO) + p->prio = DEFAULT_PRIO; + + /* + * We don't need the reset flag anymore after the fork. It has + * fulfilled its duty: + */ + p->sched_reset_on_fork = 0; + } if (!rt_prio(p->prio)) p->sched_class = &fair_sched_class; - /* - * We don't need the reset flag anymore after the fork. It has - * fulfilled its duty: - */ - p->sched_reset_on_fork = 0; - #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) if (likely(sched_info_on())) memset(&p->sched_info, 0, sizeof(p->sched_info)); From 6c697bdf08a09ce461e305a22362973036e95db3 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Wed, 17 Jun 2009 10:48:02 +0200 Subject: [PATCH 03/64] sched: Add SCHED_RESET_ON_FORK functionality for nice < 0 tasks Signed-off-by: Mike Galbraith Acked-by: Lennart Poettering Cc: Peter Zijlstra LKML-Reference: <1245228482.27326.1.camel@marge.simson.net> Signed-off-by: Ingo Molnar --- kernel/sched.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/kernel/sched.c b/kernel/sched.c index 50e4e3d15e8..34f94240642 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2627,6 +2627,11 @@ void sched_fork(struct task_struct *p, int clone_flags) if (p->normal_prio < DEFAULT_PRIO) p->prio = DEFAULT_PRIO; + if (PRIO_TO_NICE(p->static_prio) < 0) { + p->static_prio = NICE_TO_PRIO(0); + set_load_weight(p); + } + /* * We don't need the reset flag anymore after the fork. It has * fulfilled its duty: From 54d35f29f49224d86b994acb6e5969b9ba09022d Mon Sep 17 00:00:00 2001 From: Hitoshi Mitake Date: Mon, 29 Jun 2009 14:44:57 +0900 Subject: [PATCH 04/64] sched: Hide runqueues from direct reference at source code level for __raw_get_cpu_var() Hide __raw_get_cpu_var() as well - thus all the direct references to runqueues will abstracted out. Signed-off-by: Hitoshi Mitake LKML-Reference: <20090629.144457.886429910353660979.mitake@dcl.info.waseda.ac.jp> Signed-off-by: Ingo Molnar --- kernel/sched.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index 168b2680ae2..ebc5151f88c 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -692,6 +692,7 @@ static inline int cpu_of(struct rq *rq) #define this_rq() (&__get_cpu_var(runqueues)) #define task_rq(p) cpu_rq(task_cpu(p)) #define cpu_curr(cpu) (cpu_rq(cpu)->curr) +#define raw_rq() (&__raw_get_cpu_var(runqueues)) inline void update_rq_clock(struct rq *rq) { @@ -6669,7 +6670,7 @@ EXPORT_SYMBOL(yield); */ void __sched io_schedule(void) { - struct rq *rq = &__raw_get_cpu_var(runqueues); + struct rq *rq = raw_rq(); delayacct_blkio_start(); atomic_inc(&rq->nr_iowait); @@ -6681,7 +6682,7 @@ EXPORT_SYMBOL(io_schedule); long __sched io_schedule_timeout(long timeout) { - struct rq *rq = &__raw_get_cpu_var(runqueues); + struct rq *rq = raw_rq(); long ret; delayacct_blkio_start(); From e7aaaa6934636d7a6cadd9e2a05250fbb6a34f65 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 16 Jul 2009 15:44:29 +0200 Subject: [PATCH 05/64] sched: Drop the need_resched() loop from cond_resched() The schedule() function is a loop that reschedules the current task while the TIF_NEED_RESCHED flag is set: void schedule(void) { need_resched: /* schedule code */ if (need_resched()) goto need_resched; } And cond_resched() repeat this loop: do { add_preempt_count(PREEMPT_ACTIVE); schedule(); sub_preempt_count(PREEMPT_ACTIVE); } while(need_resched()); This loop is needless because schedule() already did the check and nothing can set TIF_NEED_RESCHED between schedule() exit and the loop check in need_resched(). Then remove this needless loop. Signed-off-by: Frederic Weisbecker Acked-by: Peter Zijlstra Signed-off-by: Peter Zijlstra LKML-Reference: <1247725694-6082-1-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index 03f7e3fd80b..4c5ee843d57 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6618,11 +6618,9 @@ static void __cond_resched(void) * PREEMPT_ACTIVE, which could trigger a second * cond_resched() call. */ - do { - add_preempt_count(PREEMPT_ACTIVE); - schedule(); - sub_preempt_count(PREEMPT_ACTIVE); - } while (need_resched()); + add_preempt_count(PREEMPT_ACTIVE); + schedule(); + sub_preempt_count(PREEMPT_ACTIVE); } int __sched _cond_resched(void) From 4b2155678d7cc7b5f45d6b36049091376c3408a2 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 16 Jul 2009 15:44:29 +0200 Subject: [PATCH 06/64] sched: Remove obsolete comment in __cond_resched() Remove the outdated comment from __cond_resched() related to the now removed Big Kernel Semaphore. Reported-by: Arnd Bergmann Reported-by: Ingo Molnar Signed-off-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra LKML-Reference: <1247725694-6082-2-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index 4c5ee843d57..4d39e96044b 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6613,11 +6613,6 @@ static void __cond_resched(void) #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP __might_sleep(__FILE__, __LINE__); #endif - /* - * The BKS might be reacquired before we have dropped - * PREEMPT_ACTIVE, which could trigger a second - * cond_resched() call. - */ add_preempt_count(PREEMPT_ACTIVE); schedule(); sub_preempt_count(PREEMPT_ACTIVE); From e09758fae8ccde97e026c704319eaa18d488dc86 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 16 Jul 2009 15:44:29 +0200 Subject: [PATCH 07/64] sched: Cover the CONFIG_DEBUG_SPINLOCK_SLEEP off-case for __might_sleep() Cover the off case for __might_sleep(), so that we avoid #ifdefs in files that make use of it. Especially, this prepares for the __might_sleep() pull up on cond_resched(). Signed-off-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra LKML-Reference: <1247725694-6082-3-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- include/linux/kernel.h | 1 + kernel/sched.c | 3 +-- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/kernel.h b/include/linux/kernel.h index d6320a3e8de..b804f694ae6 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -139,6 +139,7 @@ extern int _cond_resched(void); # define might_sleep() \ do { __might_sleep(__FILE__, __LINE__); might_resched(); } while (0) #else + static inline void __might_sleep(char *file, int line) { } # define might_sleep() do { might_resched(); } while (0) #endif diff --git a/kernel/sched.c b/kernel/sched.c index 4d39e96044b..370a6c31c5e 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6610,9 +6610,8 @@ static inline int should_resched(void) static void __cond_resched(void) { -#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP __might_sleep(__FILE__, __LINE__); -#endif + add_preempt_count(PREEMPT_ACTIVE); schedule(); sub_preempt_count(PREEMPT_ACTIVE); From e4aafea2d4bde8b44d6500c4ee7195bbfc51269e Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 16 Jul 2009 15:44:29 +0200 Subject: [PATCH 08/64] sched: Add a preempt count base offset to __might_sleep() Add a preempt count base offset to compare against the current preempt level count. It prepares to pull up the might_sleep check from cond_resched() to cond_resched_lock() and cond_resched_bh(). For these two helpers, we need to respectively ensure that once we'll unlock the given spinlock / reenable local softirqs, we will reach a sleepable state. Signed-off-by: Frederic Weisbecker [ Move and rename preempt_count_equals() ] Signed-off-by: Peter Zijlstra LKML-Reference: <1247725694-6082-4-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- include/linux/kernel.h | 6 +++--- kernel/sched.c | 15 +++++++++++---- 2 files changed, 14 insertions(+), 7 deletions(-) diff --git a/include/linux/kernel.h b/include/linux/kernel.h index b804f694ae6..2b5b1e0899a 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -125,7 +125,7 @@ extern int _cond_resched(void); #endif #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP - void __might_sleep(char *file, int line); + void __might_sleep(char *file, int line, int preempt_offset); /** * might_sleep - annotation for functions that can sleep * @@ -137,9 +137,9 @@ extern int _cond_resched(void); * supposed to. */ # define might_sleep() \ - do { __might_sleep(__FILE__, __LINE__); might_resched(); } while (0) + do { __might_sleep(__FILE__, __LINE__, 0); might_resched(); } while (0) #else - static inline void __might_sleep(char *file, int line) { } + static inline void __might_sleep(char *file, int line, int preempt_offset) { } # define might_sleep() do { might_resched(); } while (0) #endif diff --git a/kernel/sched.c b/kernel/sched.c index 370a6c31c5e..3ff4d004bd9 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6610,7 +6610,7 @@ static inline int should_resched(void) static void __cond_resched(void) { - __might_sleep(__FILE__, __LINE__); + __might_sleep(__FILE__, __LINE__, 0); add_preempt_count(PREEMPT_ACTIVE); schedule(); @@ -9429,13 +9429,20 @@ void __init sched_init(void) } #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP -void __might_sleep(char *file, int line) +static inline int preempt_count_equals(int preempt_offset) +{ + int nested = preempt_count() & ~PREEMPT_ACTIVE; + + return (nested == PREEMPT_INATOMIC_BASE + preempt_offset); +} + +void __might_sleep(char *file, int line, int preempt_offset) { #ifdef in_atomic static unsigned long prev_jiffy; /* ratelimiting */ - if ((!in_atomic() && !irqs_disabled()) || - system_state != SYSTEM_RUNNING || oops_in_progress) + if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) || + system_state != SYSTEM_RUNNING || oops_in_progress) return; if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) return; From 6f80bd985fe242c2e6a8b6209ed20b0495d3d63b Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 16 Jul 2009 15:44:29 +0200 Subject: [PATCH 09/64] sched: Remove the CONFIG_PREEMPT_BKL case definition of cond_resched() CONFIG_PREEMPT_BKL doesn't exist anymore. So remove this config-on case definition of cond_resched(). Reported-by: Peter Zijlstra Reported-by: Ingo Molnar Signed-off-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra LKML-Reference: <1247725694-6082-5-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- include/linux/sched.h | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 9bada20e2b2..e2bdf18e05c 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2285,17 +2285,12 @@ static inline int need_resched(void) * cond_resched_softirq() will enable bhs before scheduling. */ extern int _cond_resched(void); -#ifdef CONFIG_PREEMPT_BKL -static inline int cond_resched(void) -{ - return 0; -} -#else + static inline int cond_resched(void) { return _cond_resched(); } -#endif + extern int cond_resched_lock(spinlock_t * lock); extern int cond_resched_softirq(void); static inline int cond_resched_bkl(void) From 613afbf83298efaead05ebcac23d2285609d7160 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 16 Jul 2009 15:44:29 +0200 Subject: [PATCH 10/64] sched: Pull up the might_sleep() check into cond_resched() might_sleep() is called late-ish in cond_resched(), after the need_resched()/preempt enabled/system running tests are checked. It's better to check the sleeps while atomic earlier and not depend on some environment datas that reduce the chances to detect a problem. Also define cond_resched_*() helpers as macros, so that the FILE/LINE reported in the sleeping while atomic warning displays the real origin and not sched.h Changes in v2: - Call __might_sleep() directly instead of might_sleep() which may call cond_resched() - Turn cond_resched() into a macro so that the file:line couple reported refers to the caller of cond_resched() and not __cond_resched() itself. Changes in v3: - Also propagate this __might_sleep() pull up to cond_resched_lock() and cond_resched_softirq() Signed-off-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra LKML-Reference: <1247725694-6082-6-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- fs/dcache.c | 1 + include/linux/sched.h | 29 +++++++++++++++++++---------- kernel/sched.c | 12 +++++------- 3 files changed, 25 insertions(+), 17 deletions(-) diff --git a/fs/dcache.c b/fs/dcache.c index 9e5cd3c3a6b..a100fa35a48 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -32,6 +32,7 @@ #include #include #include +#include #include "internal.h" int sysctl_vfs_cache_pressure __read_mostly = 100; diff --git a/include/linux/sched.h b/include/linux/sched.h index e2bdf18e05c..c41d424db88 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2286,17 +2286,26 @@ static inline int need_resched(void) */ extern int _cond_resched(void); -static inline int cond_resched(void) -{ - return _cond_resched(); -} +#define cond_resched() ({ \ + __might_sleep(__FILE__, __LINE__, 0); \ + _cond_resched(); \ +}) -extern int cond_resched_lock(spinlock_t * lock); -extern int cond_resched_softirq(void); -static inline int cond_resched_bkl(void) -{ - return _cond_resched(); -} +extern int __cond_resched_lock(spinlock_t *lock); + +#define cond_resched_lock(lock) ({ \ + __might_sleep(__FILE__, __LINE__, PREEMPT_OFFSET); \ + __cond_resched_lock(lock); \ +}) + +extern int __cond_resched_softirq(void); + +#define cond_resched_softirq() ({ \ + __might_sleep(__FILE__, __LINE__, SOFTIRQ_OFFSET); \ + __cond_resched_softirq(); \ +}) + +#define cond_resched_bkl() cond_resched() /* * Does a critical section need to be broken due to another diff --git a/kernel/sched.c b/kernel/sched.c index 3ff4d004bd9..1f7919add8a 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6610,8 +6610,6 @@ static inline int should_resched(void) static void __cond_resched(void) { - __might_sleep(__FILE__, __LINE__, 0); - add_preempt_count(PREEMPT_ACTIVE); schedule(); sub_preempt_count(PREEMPT_ACTIVE); @@ -6628,14 +6626,14 @@ int __sched _cond_resched(void) EXPORT_SYMBOL(_cond_resched); /* - * cond_resched_lock() - if a reschedule is pending, drop the given lock, + * __cond_resched_lock() - if a reschedule is pending, drop the given lock, * call schedule, and on return reacquire the lock. * * This works OK both with and without CONFIG_PREEMPT. We do strange low-level * operations here to prevent schedule() from being called twice (once via * spin_unlock(), once by hand). */ -int cond_resched_lock(spinlock_t *lock) +int __cond_resched_lock(spinlock_t *lock) { int resched = should_resched(); int ret = 0; @@ -6651,9 +6649,9 @@ int cond_resched_lock(spinlock_t *lock) } return ret; } -EXPORT_SYMBOL(cond_resched_lock); +EXPORT_SYMBOL(__cond_resched_lock); -int __sched cond_resched_softirq(void) +int __sched __cond_resched_softirq(void) { BUG_ON(!in_softirq()); @@ -6665,7 +6663,7 @@ int __sched cond_resched_softirq(void) } return 0; } -EXPORT_SYMBOL(cond_resched_softirq); +EXPORT_SYMBOL(__cond_resched_softirq); /** * yield - yield the current processor to other threads. From def01bc53d03881acfc393bd10a5c7575187e008 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 16 Jul 2009 15:44:29 +0200 Subject: [PATCH 11/64] sched: Convert the only user of cond_resched_bkl to use cond_resched() fs/locks.c:flock_lock_file() is the only user of cond_resched_bkl() This helper doesn't do anything more than cond_resched(). The latter naming is enough to explain that we are rescheduling if needed. The bkl suffix suggests another semantics but it's actually a synonym of cond_resched(). Reported-by: Peter Zijlstra Signed-off-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra LKML-Reference: <1247725694-6082-7-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- fs/locks.c | 2 +- include/linux/sched.h | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/fs/locks.c b/fs/locks.c index b6440f52178..2eb81975c99 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -768,7 +768,7 @@ static int flock_lock_file(struct file *filp, struct file_lock *request) * give it the opportunity to lock the file. */ if (found) - cond_resched_bkl(); + cond_resched(); find_conflict: for_each_lock(inode, before) { diff --git a/include/linux/sched.h b/include/linux/sched.h index c41d424db88..cbbfca69aa4 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2305,8 +2305,6 @@ extern int __cond_resched_softirq(void); __cond_resched_softirq(); \ }) -#define cond_resched_bkl() cond_resched() - /* * Does a critical section need to be broken due to another * task waiting?: (technically does not depend on CONFIG_PREEMPT, From a004cd42181409eda70804ded240a791f4564d61 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 21 Jul 2009 09:54:05 +0200 Subject: [PATCH 12/64] sched: Fix return value of migration_init() migration_init() returns the return value of the hotplug notifier. In the success case this is NOTIFY_OK which is 1. initcall_debug evaluates that as an error code because init calls are expected to return 0 on success. Signed-off-by: Thomas Gleixner --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched.c b/kernel/sched.c index 1f7919add8a..953f037dc05 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7654,7 +7654,7 @@ static int __init migration_init(void) migration_call(&migration_notifier, CPU_ONLINE, cpu); register_cpu_notifier(&migration_notifier); - return err; + return 0; } early_initcall(migration_init); #endif From 716a42348cdaf04534b15fbdc9c83e25baebfed5 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 24 Jul 2009 20:05:23 +0200 Subject: [PATCH 13/64] sched: Fix cond_resched_lock() in !CONFIG_PREEMPT The might_sleep() test inside cond_resched_lock() assumes the spinlock is held and then preemption is disabled. This is true with CONFIG_PREEMPT but the preempt_count() doesn't change otherwise. Check by starting from the appropriate preempt offset depending on the config. Reported-by: Li Zefan Signed-off-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra LKML-Reference: <1248458723-12146-1-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- include/linux/sched.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index cbbfca69aa4..c472414953b 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2293,8 +2293,14 @@ extern int _cond_resched(void); extern int __cond_resched_lock(spinlock_t *lock); +#ifdef CONFIG_PREEMPT +#define PREEMPT_LOCK_OFFSET PREEMPT_OFFSET +#else +#define PREEMPT_LOCK_OFFSET 0 +#endif + #define cond_resched_lock(lock) ({ \ - __might_sleep(__FILE__, __LINE__, PREEMPT_OFFSET); \ + __might_sleep(__FILE__, __LINE__, PREEMPT_LOCK_OFFSET); \ __cond_resched_lock(lock); \ }) From a5004278f0525dcb9aa43703ef77bf371ea837cd Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 27 Jul 2009 14:04:49 +0200 Subject: [PATCH 14/64] sched: Fix cgroup smp fairness Commit ec4e0e2fe018992d980910db901637c814575914 ("fix inconsistency when redistribute per-cpu tg->cfs_rq shares") broke cgroup smp fairness. In order to avoid starvation of newly placed tasks, we never quite set the share of an empty cpu group-task to 0, but instead we set it as if there's a single NICE-0 task present. If however we actually set this in cfs_rq[cpu]->shares, that means the total shares for that group will be slightly inflated every time we balance, causing the observed unfairness. Fix this by setting cfs_rq[cpu]->shares to 0 but actually setting the effective weight of the related se to the inflated number. Signed-off-by: Peter Zijlstra LKML-Reference: <1248696557.6987.1615.camel@twins> Signed-off-by: Ingo Molnar --- kernel/sched.c | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index ce1056e9b02..26976cd8be0 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1523,13 +1523,18 @@ static void update_group_shares_cpu(struct task_group *tg, int cpu, unsigned long sd_shares, unsigned long sd_rq_weight) { - unsigned long shares; unsigned long rq_weight; + unsigned long shares; + int boost = 0; if (!tg->se[cpu]) return; rq_weight = tg->cfs_rq[cpu]->rq_weight; + if (!rq_weight) { + boost = 1; + rq_weight = NICE_0_LOAD; + } /* * \Sum shares * rq_weight @@ -1546,8 +1551,7 @@ update_group_shares_cpu(struct task_group *tg, int cpu, unsigned long flags; spin_lock_irqsave(&rq->lock, flags); - tg->cfs_rq[cpu]->shares = shares; - + tg->cfs_rq[cpu]->shares = boost ? 0 : shares; __set_se_shares(tg->se[cpu], shares); spin_unlock_irqrestore(&rq->lock, flags); } @@ -1560,7 +1564,7 @@ update_group_shares_cpu(struct task_group *tg, int cpu, */ static int tg_shares_up(struct task_group *tg, void *data) { - unsigned long weight, rq_weight = 0; + unsigned long weight, rq_weight = 0, eff_weight = 0; unsigned long shares = 0; struct sched_domain *sd = data; int i; @@ -1572,11 +1576,13 @@ static int tg_shares_up(struct task_group *tg, void *data) * run here it will not get delayed by group starvation. */ weight = tg->cfs_rq[i]->load.weight; + tg->cfs_rq[i]->rq_weight = weight; + rq_weight += weight; + if (!weight) weight = NICE_0_LOAD; - tg->cfs_rq[i]->rq_weight = weight; - rq_weight += weight; + eff_weight += weight; shares += tg->cfs_rq[i]->shares; } @@ -1586,8 +1592,14 @@ static int tg_shares_up(struct task_group *tg, void *data) if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE)) shares = tg->shares; - for_each_cpu(i, sched_domain_span(sd)) - update_group_shares_cpu(tg, i, shares, rq_weight); + for_each_cpu(i, sched_domain_span(sd)) { + unsigned long sd_rq_weight = rq_weight; + + if (!tg->cfs_rq[i]->rq_weight) + sd_rq_weight = eff_weight; + + update_group_shares_cpu(tg, i, shares, sd_rq_weight); + } return 0; } From e709715915d69b6a929d77e7652c9c3fea61c317 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 3 Jun 2009 15:41:20 +0200 Subject: [PATCH 15/64] sched: Optimize unused cgroup configuration When cgroup group scheduling is built in, skip some code paths if we don't have any (but the root) cgroups configured. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index 26976cd8be0..ca1f76ba777 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1629,8 +1629,14 @@ static int tg_load_down(struct task_group *tg, void *data) static void update_shares(struct sched_domain *sd) { - u64 now = cpu_clock(raw_smp_processor_id()); - s64 elapsed = now - sd->last_update; + s64 elapsed; + u64 now; + + if (root_task_group_empty()) + return; + + now = cpu_clock(raw_smp_processor_id()); + elapsed = now - sd->last_update; if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) { sd->last_update = now; @@ -1640,6 +1646,9 @@ static void update_shares(struct sched_domain *sd) static void update_shares_locked(struct rq *rq, struct sched_domain *sd) { + if (root_task_group_empty()) + return; + spin_unlock(&rq->lock); update_shares(sd); spin_lock(&rq->lock); @@ -1647,6 +1656,9 @@ static void update_shares_locked(struct rq *rq, struct sched_domain *sd) static void update_h_load(long cpu) { + if (root_task_group_empty()) + return; + walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); } From da19ab510343c6496fe8b8f890091296032025c9 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 29 Jul 2009 00:21:22 -0400 Subject: [PATCH 16/64] sched: Check for pushing rt tasks after all scheduling The current method for pushing RT tasks after scheduling only happens after a context switch. But we found cases where a task is set up on a run queue to be pushed but the push never happens because the schedule chooses the same task. This bug was found with the help of Gregory Haskins and the use of ftrace (trace_printk). It tooks several days for both of us analyzing the code and the trace output to find this. Signed-off-by: Steven Rostedt Signed-off-by: Peter Zijlstra LKML-Reference: <20090729042526.205923666@goodmis.org> Signed-off-by: Ingo Molnar --- kernel/sched.c | 38 +++++++++++++++++++++++++++----------- 1 file changed, 27 insertions(+), 11 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index ca1f76ba777..a030d4514cd 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2839,14 +2839,14 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev, * with the lock held can cause deadlocks; see schedule() for * details.) */ -static void finish_task_switch(struct rq *rq, struct task_struct *prev) +static int finish_task_switch(struct rq *rq, struct task_struct *prev) __releases(rq->lock) { struct mm_struct *mm = rq->prev_mm; long prev_state; -#ifdef CONFIG_SMP int post_schedule = 0; +#ifdef CONFIG_SMP if (current->sched_class->needs_post_schedule) post_schedule = current->sched_class->needs_post_schedule(rq); #endif @@ -2868,10 +2868,6 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) finish_arch_switch(prev); perf_counter_task_sched_in(current, cpu_of(rq)); finish_lock_switch(rq, prev); -#ifdef CONFIG_SMP - if (post_schedule) - current->sched_class->post_schedule(rq); -#endif fire_sched_in_preempt_notifiers(current); if (mm) @@ -2884,6 +2880,8 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) kprobe_flush_task(prev); put_task_struct(prev); } + + return post_schedule; } /** @@ -2894,8 +2892,15 @@ asmlinkage void schedule_tail(struct task_struct *prev) __releases(rq->lock) { struct rq *rq = this_rq(); + int post_schedule; + + post_schedule = finish_task_switch(rq, prev); + +#ifdef CONFIG_SMP + if (post_schedule) + current->sched_class->post_schedule(rq); +#endif - finish_task_switch(rq, prev); #ifdef __ARCH_WANT_UNLOCKED_CTXSW /* In this case, finish_task_switch does not reenable preemption */ preempt_enable(); @@ -2908,7 +2913,7 @@ asmlinkage void schedule_tail(struct task_struct *prev) * context_switch - switch to the new MM and the new * thread's register state. */ -static inline void +static inline int context_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next) { @@ -2955,7 +2960,7 @@ context_switch(struct rq *rq, struct task_struct *prev, * CPUs since it called schedule(), thus the 'rq' on its stack * frame will be invalid. */ - finish_task_switch(this_rq(), prev); + return finish_task_switch(this_rq(), prev); } /* @@ -5366,6 +5371,7 @@ asmlinkage void __sched schedule(void) { struct task_struct *prev, *next; unsigned long *switch_count; + int post_schedule = 0; struct rq *rq; int cpu; @@ -5416,15 +5422,25 @@ need_resched_nonpreemptible: rq->curr = next; ++*switch_count; - context_switch(rq, prev, next); /* unlocks the rq */ + post_schedule = context_switch(rq, prev, next); /* unlocks the rq */ /* * the context switch might have flipped the stack from under * us, hence refresh the local variables. */ cpu = smp_processor_id(); rq = cpu_rq(cpu); - } else + } else { +#ifdef CONFIG_SMP + if (current->sched_class->needs_post_schedule) + post_schedule = current->sched_class->needs_post_schedule(rq); +#endif spin_unlock_irq(&rq->lock); + } + +#ifdef CONFIG_SMP + if (post_schedule) + current->sched_class->post_schedule(rq); +#endif if (unlikely(reacquire_kernel_lock(current) < 0)) goto need_resched_nonpreemptible; From c3a2ae3d93c0f10d29c071f599764d00b8de00cb Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 29 Jul 2009 00:21:23 -0400 Subject: [PATCH 17/64] sched: Add new prio to cpupri before removing old prio We need to add the new prio to the cpupri accounting before removing the old prio. This is because removing the old prio first will open a race window where the cpu will be removed from pri_active. In this case the cpu will not be visible for RT push and pulls. This could cause a RT task to not migrate appropriately, and create a very large latency. This bug was found with the use of ftrace sched events and trace_printk. Signed-off-by: Steven Rostedt Signed-off-by: Peter Zijlstra LKML-Reference: <20090729042526.438281019@goodmis.org> Signed-off-by: Ingo Molnar --- kernel/sched_cpupri.c | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c index d014efbf947..0f052fc674d 100644 --- a/kernel/sched_cpupri.c +++ b/kernel/sched_cpupri.c @@ -127,21 +127,11 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri) /* * If the cpu was currently mapped to a different value, we - * first need to unmap the old value + * need to map it to the new value then remove the old value. + * Note, we must add the new value first, otherwise we risk the + * cpu being cleared from pri_active, and this cpu could be + * missed for a push or pull. */ - if (likely(oldpri != CPUPRI_INVALID)) { - struct cpupri_vec *vec = &cp->pri_to_cpu[oldpri]; - - spin_lock_irqsave(&vec->lock, flags); - - vec->count--; - if (!vec->count) - clear_bit(oldpri, cp->pri_active); - cpumask_clear_cpu(cpu, vec->mask); - - spin_unlock_irqrestore(&vec->lock, flags); - } - if (likely(newpri != CPUPRI_INVALID)) { struct cpupri_vec *vec = &cp->pri_to_cpu[newpri]; @@ -154,6 +144,18 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri) spin_unlock_irqrestore(&vec->lock, flags); } + if (likely(oldpri != CPUPRI_INVALID)) { + struct cpupri_vec *vec = &cp->pri_to_cpu[oldpri]; + + spin_lock_irqsave(&vec->lock, flags); + + vec->count--; + if (!vec->count) + clear_bit(oldpri, cp->pri_active); + cpumask_clear_cpu(cpu, vec->mask); + + spin_unlock_irqrestore(&vec->lock, flags); + } *currpri = newpri; } From 3f029d3c6d62068d59301d90c18dbde8ee402107 Mon Sep 17 00:00:00 2001 From: Gregory Haskins Date: Wed, 29 Jul 2009 11:08:47 -0400 Subject: [PATCH 18/64] sched: Enhance the pre/post scheduling logic We currently have an explicit "needs_post" vtable method which returns a stack variable for whether we should later run post-schedule. This leads to an awkward exchange of the variable as it bubbles back up out of the context switch. Peter Zijlstra observed that this information could be stored in the run-queue itself instead of handled on the stack. Therefore, we revert to the method of having context_switch return void, and update an internal rq->post_schedule variable when we require further processing. In addition, we fix a race condition where we try to access current->sched_class without holding the rq->lock. This is technically racy, as the sched-class could change out from under us. Instead, we reference the per-rq post_schedule variable with the runqueue unlocked, but with preemption disabled to see if we need to reacquire the rq->lock. Finally, we clean the code up slightly by removing the #ifdef CONFIG_SMP conditionals from the schedule() call, and implement some inline helper functions instead. This patch passes checkpatch, and rt-migrate. Signed-off-by: Gregory Haskins Signed-off-by: Peter Zijlstra LKML-Reference: <20090729150422.17691.55590.stgit@dev.haskins.net> Signed-off-by: Ingo Molnar --- include/linux/sched.h | 1 - kernel/sched.c | 84 ++++++++++++++++++++++++++----------------- kernel/sched_rt.c | 31 ++++++---------- 3 files changed, 62 insertions(+), 54 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 2c35bc29d2a..195d72d5c10 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1047,7 +1047,6 @@ struct sched_class { struct rq *busiest, struct sched_domain *sd, enum cpu_idle_type idle); void (*pre_schedule) (struct rq *this_rq, struct task_struct *task); - int (*needs_post_schedule) (struct rq *this_rq); void (*post_schedule) (struct rq *this_rq); void (*task_wake_up) (struct rq *this_rq, struct task_struct *task); diff --git a/kernel/sched.c b/kernel/sched.c index a030d4514cd..613fee54fc8 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -616,6 +616,7 @@ struct rq { unsigned char idle_at_tick; /* For active balancing */ + int post_schedule; int active_balance; int push_cpu; /* cpu of this runqueue: */ @@ -2839,17 +2840,11 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev, * with the lock held can cause deadlocks; see schedule() for * details.) */ -static int finish_task_switch(struct rq *rq, struct task_struct *prev) +static void finish_task_switch(struct rq *rq, struct task_struct *prev) __releases(rq->lock) { struct mm_struct *mm = rq->prev_mm; long prev_state; - int post_schedule = 0; - -#ifdef CONFIG_SMP - if (current->sched_class->needs_post_schedule) - post_schedule = current->sched_class->needs_post_schedule(rq); -#endif rq->prev_mm = NULL; @@ -2880,10 +2875,44 @@ static int finish_task_switch(struct rq *rq, struct task_struct *prev) kprobe_flush_task(prev); put_task_struct(prev); } - - return post_schedule; } +#ifdef CONFIG_SMP + +/* assumes rq->lock is held */ +static inline void pre_schedule(struct rq *rq, struct task_struct *prev) +{ + if (prev->sched_class->pre_schedule) + prev->sched_class->pre_schedule(rq, prev); +} + +/* rq->lock is NOT held, but preemption is disabled */ +static inline void post_schedule(struct rq *rq) +{ + if (rq->post_schedule) { + unsigned long flags; + + spin_lock_irqsave(&rq->lock, flags); + if (rq->curr->sched_class->post_schedule) + rq->curr->sched_class->post_schedule(rq); + spin_unlock_irqrestore(&rq->lock, flags); + + rq->post_schedule = 0; + } +} + +#else + +static inline void pre_schedule(struct rq *rq, struct task_struct *p) +{ +} + +static inline void post_schedule(struct rq *rq) +{ +} + +#endif + /** * schedule_tail - first thing a freshly forked thread must call. * @prev: the thread we just switched away from. @@ -2892,14 +2921,14 @@ asmlinkage void schedule_tail(struct task_struct *prev) __releases(rq->lock) { struct rq *rq = this_rq(); - int post_schedule; - post_schedule = finish_task_switch(rq, prev); + finish_task_switch(rq, prev); -#ifdef CONFIG_SMP - if (post_schedule) - current->sched_class->post_schedule(rq); -#endif + /* + * FIXME: do we need to worry about rq being invalidated by the + * task_switch? + */ + post_schedule(rq); #ifdef __ARCH_WANT_UNLOCKED_CTXSW /* In this case, finish_task_switch does not reenable preemption */ @@ -2913,7 +2942,7 @@ asmlinkage void schedule_tail(struct task_struct *prev) * context_switch - switch to the new MM and the new * thread's register state. */ -static inline int +static inline void context_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next) { @@ -2960,7 +2989,7 @@ context_switch(struct rq *rq, struct task_struct *prev, * CPUs since it called schedule(), thus the 'rq' on its stack * frame will be invalid. */ - return finish_task_switch(this_rq(), prev); + finish_task_switch(this_rq(), prev); } /* @@ -5371,7 +5400,6 @@ asmlinkage void __sched schedule(void) { struct task_struct *prev, *next; unsigned long *switch_count; - int post_schedule = 0; struct rq *rq; int cpu; @@ -5403,10 +5431,7 @@ need_resched_nonpreemptible: switch_count = &prev->nvcsw; } -#ifdef CONFIG_SMP - if (prev->sched_class->pre_schedule) - prev->sched_class->pre_schedule(rq, prev); -#endif + pre_schedule(rq, prev); if (unlikely(!rq->nr_running)) idle_balance(cpu, rq); @@ -5422,25 +5447,17 @@ need_resched_nonpreemptible: rq->curr = next; ++*switch_count; - post_schedule = context_switch(rq, prev, next); /* unlocks the rq */ + context_switch(rq, prev, next); /* unlocks the rq */ /* * the context switch might have flipped the stack from under * us, hence refresh the local variables. */ cpu = smp_processor_id(); rq = cpu_rq(cpu); - } else { -#ifdef CONFIG_SMP - if (current->sched_class->needs_post_schedule) - post_schedule = current->sched_class->needs_post_schedule(rq); -#endif + } else spin_unlock_irq(&rq->lock); - } -#ifdef CONFIG_SMP - if (post_schedule) - current->sched_class->post_schedule(rq); -#endif + post_schedule(rq); if (unlikely(reacquire_kernel_lock(current) < 0)) goto need_resched_nonpreemptible; @@ -9403,6 +9420,7 @@ void __init sched_init(void) #ifdef CONFIG_SMP rq->sd = NULL; rq->rd = NULL; + rq->post_schedule = 0; rq->active_balance = 0; rq->next_balance = jiffies; rq->push_cpu = 0; diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 3918e01994e..a8f89bc3e5e 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -1056,6 +1056,11 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq) return p; } +static inline int has_pushable_tasks(struct rq *rq) +{ + return !plist_head_empty(&rq->rt.pushable_tasks); +} + static struct task_struct *pick_next_task_rt(struct rq *rq) { struct task_struct *p = _pick_next_task_rt(rq); @@ -1064,6 +1069,12 @@ static struct task_struct *pick_next_task_rt(struct rq *rq) if (p) dequeue_pushable_task(rq, p); + /* + * We detect this state here so that we can avoid taking the RQ + * lock again later if there is no need to push + */ + rq->post_schedule = has_pushable_tasks(rq); + return p; } @@ -1262,11 +1273,6 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq) return lowest_rq; } -static inline int has_pushable_tasks(struct rq *rq) -{ - return !plist_head_empty(&rq->rt.pushable_tasks); -} - static struct task_struct *pick_next_pushable_task(struct rq *rq) { struct task_struct *p; @@ -1466,23 +1472,9 @@ static void pre_schedule_rt(struct rq *rq, struct task_struct *prev) pull_rt_task(rq); } -/* - * assumes rq->lock is held - */ -static int needs_post_schedule_rt(struct rq *rq) -{ - return has_pushable_tasks(rq); -} - static void post_schedule_rt(struct rq *rq) { - /* - * This is only called if needs_post_schedule_rt() indicates that - * we need to push tasks away - */ - spin_lock_irq(&rq->lock); push_rt_tasks(rq); - spin_unlock_irq(&rq->lock); } /* @@ -1758,7 +1750,6 @@ static const struct sched_class rt_sched_class = { .rq_online = rq_online_rt, .rq_offline = rq_offline_rt, .pre_schedule = pre_schedule_rt, - .needs_post_schedule = needs_post_schedule_rt, .post_schedule = post_schedule_rt, .task_wake_up = task_wake_up_rt, .switched_from = switched_from_rt, From 00aec93d10a051ea64f83eff75d4065a19508ea6 Mon Sep 17 00:00:00 2001 From: Gregory Haskins Date: Thu, 30 Jul 2009 10:57:23 -0400 Subject: [PATCH 19/64] sched: Fully integrate cpus_active_map and root-domain code Reflect "active" cpus in the rq->rd->online field, instead of the online_map. The motivation is that things that use the root-domain code (such as cpupri) only care about cpus classified as "active" anyway. By synchronizing the root-domain state with the active map, we allow several optimizations. For instance, we can remove an extra cpumask_and from the scheduler hotpath by utilizing rq->rd->online (since it is now a cached version of cpu_active_map & rq->rd->span). Signed-off-by: Gregory Haskins Acked-by: Peter Zijlstra Acked-by: Max Krasnyansky Signed-off-by: Peter Zijlstra LKML-Reference: <20090730145723.25226.24493.stgit@dev.haskins.net> Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- kernel/sched_fair.c | 10 +++++++--- kernel/sched_rt.c | 7 ------- 3 files changed, 8 insertions(+), 11 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index 613fee54fc8..475138c4254 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7927,7 +7927,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd) rq->rd = rd; cpumask_set_cpu(rq->cpu, rd->span); - if (cpumask_test_cpu(rq->cpu, cpu_online_mask)) + if (cpumask_test_cpu(rq->cpu, cpu_active_mask)) set_rq_online(rq); spin_unlock_irqrestore(&rq->lock, flags); diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 652e8bdef9a..49347298487 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1046,17 +1046,21 @@ static void yield_task_fair(struct rq *rq) * search starts with cpus closest then further out as needed, * so we always favor a closer, idle cpu. * Domains may include CPUs that are not usable for migration, - * hence we need to mask them out (cpu_active_mask) + * hence we need to mask them out (rq->rd->online) * * Returns the CPU we should wake onto. */ #if defined(ARCH_HAS_SCHED_WAKE_IDLE) + +#define cpu_rd_active(cpu, rq) cpumask_test_cpu(cpu, rq->rd->online) + static int wake_idle(int cpu, struct task_struct *p) { struct sched_domain *sd; int i; unsigned int chosen_wakeup_cpu; int this_cpu; + struct rq *task_rq = task_rq(p); /* * At POWERSAVINGS_BALANCE_WAKEUP level, if both this_cpu and prev_cpu @@ -1089,10 +1093,10 @@ static int wake_idle(int cpu, struct task_struct *p) for_each_domain(cpu, sd) { if ((sd->flags & SD_WAKE_IDLE) || ((sd->flags & SD_WAKE_IDLE_FAR) - && !task_hot(p, task_rq(p)->clock, sd))) { + && !task_hot(p, task_rq->clock, sd))) { for_each_cpu_and(i, sched_domain_span(sd), &p->cpus_allowed) { - if (cpu_active(i) && idle_cpu(i)) { + if (cpu_rd_active(i, task_rq) && idle_cpu(i)) { if (i != task_cpu(p)) { schedstat_inc(p, se.nr_wakeups_idle); diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index a8f89bc3e5e..13f728ef5b3 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -1172,13 +1172,6 @@ static int find_lowest_rq(struct task_struct *task) if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask)) return -1; /* No targets found */ - /* - * Only consider CPUs that are usable for migration. - * I guess we might want to change cpupri_find() to ignore those - * in the first place. - */ - cpumask_and(lowest_mask, lowest_mask, cpu_active_mask); - /* * At this point we have built a mask of cpus representing the * lowest priority tasks in the system. Now we want to elect From 8f48894fcc89ddec62e1762f73a0825793e59e91 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 24 Jul 2009 12:25:30 +0200 Subject: [PATCH 20/64] sched: Add debug check to task_of() A frequent mistake appears to be to call task_of() on a scheduler entity that is not actually a task, which can result in a wild pointer. Add a check to catch these mistakes. Suggested-by: Ingo Molnar Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 20 ++++++++++++++------ kernel/sched_rt.c | 18 +++++++++++++----- 2 files changed, 27 insertions(+), 11 deletions(-) diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 49347298487..342000b31ad 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -79,11 +79,6 @@ static const struct sched_class fair_sched_class; * CFS operations on generic schedulable entities: */ -static inline struct task_struct *task_of(struct sched_entity *se) -{ - return container_of(se, struct task_struct, se); -} - #ifdef CONFIG_FAIR_GROUP_SCHED /* cpu runqueue to which this cfs_rq is attached */ @@ -95,6 +90,14 @@ static inline struct rq *rq_of(struct cfs_rq *cfs_rq) /* An entity is a task if it doesn't "own" a runqueue */ #define entity_is_task(se) (!se->my_q) +static inline struct task_struct *task_of(struct sched_entity *se) +{ +#ifdef CONFIG_SCHED_DEBUG + WARN_ON_ONCE(!entity_is_task(se)); +#endif + return container_of(se, struct task_struct, se); +} + /* Walk up scheduling entities hierarchy */ #define for_each_sched_entity(se) \ for (; se; se = se->parent) @@ -186,7 +189,12 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse) } } -#else /* CONFIG_FAIR_GROUP_SCHED */ +#else /* !CONFIG_FAIR_GROUP_SCHED */ + +static inline struct task_struct *task_of(struct sched_entity *se) +{ + return container_of(se, struct task_struct, se); +} static inline struct rq *rq_of(struct cfs_rq *cfs_rq) { diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 13f728ef5b3..f365e66b3d4 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -3,15 +3,18 @@ * policies) */ -static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se) -{ - return container_of(rt_se, struct task_struct, rt); -} - #ifdef CONFIG_RT_GROUP_SCHED #define rt_entity_is_task(rt_se) (!(rt_se)->my_q) +static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se) +{ +#ifdef CONFIG_SCHED_DEBUG + WARN_ON_ONCE(!rt_entity_is_task(rt_se)); +#endif + return container_of(rt_se, struct task_struct, rt); +} + static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq) { return rt_rq->rq; @@ -26,6 +29,11 @@ static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se) #define rt_entity_is_task(rt_se) (1) +static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se) +{ + return container_of(rt_se, struct task_struct, rt); +} + static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq) { return container_of(rt_rq, struct rq, rt); From 693525e3bea25cf2ee6cf2b862ba7c148e891df2 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 21 Jul 2009 13:56:38 +0200 Subject: [PATCH 21/64] sched: Ensure the migration task doesn't go away during use Like sched_migrate_task(), set_cpus_allowed_ptr() should hold onto the migration thread too. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel/sched.c b/kernel/sched.c index 475138c4254..7f83be35d65 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7078,8 +7078,12 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) if (migrate_task(p, cpumask_any_and(cpu_online_mask, new_mask), &req)) { /* Need help from migration thread: drop lock and wait. */ + struct task_struct *mt = rq->migration_thread; + + get_task_struct(mt); task_rq_unlock(rq, &flags); wake_up_process(rq->migration_thread); + put_task_struct(mt); wait_for_completion(&req.done); tlb_migrate_finish(p->mm); return 0; From bcf08df3b23b3d13bf8c4ad6bd744a6ad30015fb Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 19 Apr 2008 12:11:10 +0200 Subject: [PATCH 22/64] sched: Fix cpupri build on !CONFIG_SMP MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This build bug: In file included from kernel/sched.c:1765: kernel/sched_rt.c: In function ‘has_pushable_tasks’: kernel/sched_rt.c:1069: error: ‘struct rt_rq’ has no member named ‘pushable_tasks’ kernel/sched_rt.c: In function ‘pick_next_task_rt’: kernel/sched_rt.c:1084: error: ‘struct rq’ has no member named ‘post_schedule’ Triggers because both pushable_tasks and post_schedule are SMP-only fields. Move pushable_tasks() to the SMP section and #ifdef the post_schedule use. Cc: Gregory Haskins Cc: Peter Zijlstra LKML-Reference: <20090729150422.17691.55590.stgit@dev.haskins.net> Signed-off-by: Ingo Molnar --- kernel/sched_rt.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index f365e66b3d4..3d4020a9ba1 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -136,6 +136,11 @@ static void dequeue_pushable_task(struct rq *rq, struct task_struct *p) plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks); } +static inline int has_pushable_tasks(struct rq *rq) +{ + return !plist_head_empty(&rq->rt.pushable_tasks); +} + #else static inline void enqueue_pushable_task(struct rq *rq, struct task_struct *p) @@ -1064,11 +1069,6 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq) return p; } -static inline int has_pushable_tasks(struct rq *rq) -{ - return !plist_head_empty(&rq->rt.pushable_tasks); -} - static struct task_struct *pick_next_task_rt(struct rq *rq) { struct task_struct *p = _pick_next_task_rt(rq); @@ -1077,11 +1077,13 @@ static struct task_struct *pick_next_task_rt(struct rq *rq) if (p) dequeue_pushable_task(rq, p); +#ifdef CONFIG_SMP /* * We detect this state here so that we can avoid taking the RQ * lock again later if there is no need to push */ rq->post_schedule = has_pushable_tasks(rq); +#endif return p; } From 8e5b59a2d728e6963b35dba8bb36e0b70267462e Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 6 Aug 2009 16:02:50 -0700 Subject: [PATCH 23/64] sched: Add default defines for PREEMPT_ACTIVE The PREEMPT_ACTIVE setting doesn't actually need to be arch-specific, so set up a sane default for all arches to (hopefully) migrate to. > if we look at linux/hardirq.h, it makes this claim: > * - bit 28 is the PREEMPT_ACTIVE flag > if that's true, then why are we letting any arch set this define ? a > quick survey shows that half the arches (11) are using 0x10000000 (bit > 28) while the other half (10) are using 0x4000000 (bit 26). and then > there is the ia64 oddity which uses bit 30. the exact value here > shouldnt really matter across arches though should it ? actually alpha, arm and avr32 also use bit 30 (0x40000000), there are only five (or eight, depending on how you count) architectures (blackfin, h8300, m68k, s390 and sparc) using bit 26. Signed-off-by: Arnd Bergmann Signed-off-by: Mike Frysinger Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar --- include/linux/hardirq.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h index 8246c697863..0d885fd7511 100644 --- a/include/linux/hardirq.h +++ b/include/linux/hardirq.h @@ -64,6 +64,12 @@ #define HARDIRQ_OFFSET (1UL << HARDIRQ_SHIFT) #define NMI_OFFSET (1UL << NMI_SHIFT) +#ifndef PREEMPT_ACTIVE +#define PREEMPT_ACTIVE_BITS 1 +#define PREEMPT_ACTIVE_SHIFT (NMI_SHIFT + NMI_BITS) +#define PREEMPT_ACTIVE (__IRQ_MASK(PREEMPT_ACTIVE_BITS) << PREEMPT_ACTIVE_SHIFT) +#endif + #if PREEMPT_ACTIVE < (1 << (NMI_SHIFT + NMI_BITS)) #error PREEMPT_ACTIVE is too low! #endif From 1314562a9ae5f39f6f595656023c1baf970831ef Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Tue, 18 Aug 2009 15:06:02 +0900 Subject: [PATCH 24/64] sched, task_struct: stack_canary is not needed without CC_STACKPROTECTOR The field stack_canary is only used with CC_STACKPROTECTOR. This patch reduces task_struct size without CC_STACKPROTECTOR. Signed-off-by: Hiroshi Shimamoto LKML-Reference: <4A8A44CA.2020701@ct.jp.nec.com> Signed-off-by: Ingo Molnar --- include/linux/sched.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/linux/sched.h b/include/linux/sched.h index 195d72d5c10..7bc2d929083 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1237,8 +1237,10 @@ struct task_struct { pid_t pid; pid_t tgid; +#ifdef CONFIG_CC_STACKPROTECTOR /* Canary value for the -fstack-protector gcc feature */ unsigned long stack_canary; +#endif /* * pointers to (original) parent process, youngest child, younger sibling, From 49a02c514d967921a908ac64e9c0ec0f0fc17fd8 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Tue, 18 Aug 2009 12:51:52 +0200 Subject: [PATCH 25/64] sched: Use structure to store local data in __build_sched_domains Signed-off-by: Andreas Herrmann Cc: Peter Zijlstra LKML-Reference: <20090818105152.GB29515@alberich.amd.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 165 ++++++++++++++++++++++++++----------------------- 1 file changed, 89 insertions(+), 76 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index 1b59e265273..565ff775fcd 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -8091,6 +8091,22 @@ struct static_sched_domain { DECLARE_BITMAP(span, CONFIG_NR_CPUS); }; +struct s_data { +#ifdef CONFIG_NUMA + int sd_allnodes; + cpumask_var_t domainspan; + cpumask_var_t covered; + cpumask_var_t notcovered; +#endif + cpumask_var_t nodemask; + cpumask_var_t this_sibling_map; + cpumask_var_t this_core_map; + cpumask_var_t send_covered; + cpumask_var_t tmpmask; + struct sched_group **sched_group_nodes; + struct root_domain *rd; +}; + /* * SMT sched-domains: */ @@ -8385,54 +8401,49 @@ static void set_domain_attribute(struct sched_domain *sd, static int __build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *attr) { + struct s_data d; int i, err = -ENOMEM; - struct root_domain *rd; - cpumask_var_t nodemask, this_sibling_map, this_core_map, send_covered, - tmpmask; #ifdef CONFIG_NUMA - cpumask_var_t domainspan, covered, notcovered; - struct sched_group **sched_group_nodes = NULL; - int sd_allnodes = 0; - - if (!alloc_cpumask_var(&domainspan, GFP_KERNEL)) + d.sd_allnodes = 0; + if (!alloc_cpumask_var(&d.domainspan, GFP_KERNEL)) goto out; - if (!alloc_cpumask_var(&covered, GFP_KERNEL)) + if (!alloc_cpumask_var(&d.covered, GFP_KERNEL)) goto free_domainspan; - if (!alloc_cpumask_var(¬covered, GFP_KERNEL)) + if (!alloc_cpumask_var(&d.notcovered, GFP_KERNEL)) goto free_covered; #endif - if (!alloc_cpumask_var(&nodemask, GFP_KERNEL)) + if (!alloc_cpumask_var(&d.nodemask, GFP_KERNEL)) goto free_notcovered; - if (!alloc_cpumask_var(&this_sibling_map, GFP_KERNEL)) + if (!alloc_cpumask_var(&d.this_sibling_map, GFP_KERNEL)) goto free_nodemask; - if (!alloc_cpumask_var(&this_core_map, GFP_KERNEL)) + if (!alloc_cpumask_var(&d.this_core_map, GFP_KERNEL)) goto free_this_sibling_map; - if (!alloc_cpumask_var(&send_covered, GFP_KERNEL)) + if (!alloc_cpumask_var(&d.send_covered, GFP_KERNEL)) goto free_this_core_map; - if (!alloc_cpumask_var(&tmpmask, GFP_KERNEL)) + if (!alloc_cpumask_var(&d.tmpmask, GFP_KERNEL)) goto free_send_covered; #ifdef CONFIG_NUMA /* * Allocate the per-node list of sched groups */ - sched_group_nodes = kcalloc(nr_node_ids, sizeof(struct sched_group *), - GFP_KERNEL); - if (!sched_group_nodes) { + d.sched_group_nodes = kcalloc(nr_node_ids, sizeof(struct sched_group *), + GFP_KERNEL); + if (!d.sched_group_nodes) { printk(KERN_WARNING "Can not alloc sched group node list\n"); goto free_tmpmask; } #endif - rd = alloc_rootdomain(); - if (!rd) { + d.rd = alloc_rootdomain(); + if (!d.rd) { printk(KERN_WARNING "Cannot alloc root domain\n"); goto free_sched_groups; } #ifdef CONFIG_NUMA - sched_group_nodes_bycpu[cpumask_first(cpu_map)] = sched_group_nodes; + sched_group_nodes_bycpu[cpumask_first(cpu_map)] = d.sched_group_nodes; #endif /* @@ -8441,18 +8452,20 @@ static int __build_sched_domains(const struct cpumask *cpu_map, for_each_cpu(i, cpu_map) { struct sched_domain *sd = NULL, *p; - cpumask_and(nodemask, cpumask_of_node(cpu_to_node(i)), cpu_map); + cpumask_and(d.nodemask, cpumask_of_node(cpu_to_node(i)), + cpu_map); #ifdef CONFIG_NUMA if (cpumask_weight(cpu_map) > - SD_NODES_PER_DOMAIN*cpumask_weight(nodemask)) { + SD_NODES_PER_DOMAIN*cpumask_weight(d.nodemask)) { sd = &per_cpu(allnodes_domains, i).sd; SD_INIT(sd, ALLNODES); set_domain_attribute(sd, attr); cpumask_copy(sched_domain_span(sd), cpu_map); - cpu_to_allnodes_group(i, cpu_map, &sd->groups, tmpmask); + cpu_to_allnodes_group(i, cpu_map, &sd->groups, + d.tmpmask); p = sd; - sd_allnodes = 1; + d.sd_allnodes = 1; } else p = NULL; @@ -8471,11 +8484,11 @@ static int __build_sched_domains(const struct cpumask *cpu_map, sd = &per_cpu(phys_domains, i).sd; SD_INIT(sd, CPU); set_domain_attribute(sd, attr); - cpumask_copy(sched_domain_span(sd), nodemask); + cpumask_copy(sched_domain_span(sd), d.nodemask); sd->parent = p; if (p) p->child = sd; - cpu_to_phys_group(i, cpu_map, &sd->groups, tmpmask); + cpu_to_phys_group(i, cpu_map, &sd->groups, d.tmpmask); #ifdef CONFIG_SCHED_MC p = sd; @@ -8486,7 +8499,7 @@ static int __build_sched_domains(const struct cpumask *cpu_map, cpu_coregroup_mask(i)); sd->parent = p; p->child = sd; - cpu_to_core_group(i, cpu_map, &sd->groups, tmpmask); + cpu_to_core_group(i, cpu_map, &sd->groups, d.tmpmask); #endif #ifdef CONFIG_SCHED_SMT @@ -8498,54 +8511,54 @@ static int __build_sched_domains(const struct cpumask *cpu_map, topology_thread_cpumask(i), cpu_map); sd->parent = p; p->child = sd; - cpu_to_cpu_group(i, cpu_map, &sd->groups, tmpmask); + cpu_to_cpu_group(i, cpu_map, &sd->groups, d.tmpmask); #endif } #ifdef CONFIG_SCHED_SMT /* Set up CPU (sibling) groups */ for_each_cpu(i, cpu_map) { - cpumask_and(this_sibling_map, + cpumask_and(d.this_sibling_map, topology_thread_cpumask(i), cpu_map); - if (i != cpumask_first(this_sibling_map)) + if (i != cpumask_first(d.this_sibling_map)) continue; - init_sched_build_groups(this_sibling_map, cpu_map, + init_sched_build_groups(d.this_sibling_map, cpu_map, &cpu_to_cpu_group, - send_covered, tmpmask); + d.send_covered, d.tmpmask); } #endif #ifdef CONFIG_SCHED_MC /* Set up multi-core groups */ for_each_cpu(i, cpu_map) { - cpumask_and(this_core_map, cpu_coregroup_mask(i), cpu_map); - if (i != cpumask_first(this_core_map)) + cpumask_and(d.this_core_map, cpu_coregroup_mask(i), cpu_map); + if (i != cpumask_first(d.this_core_map)) continue; - init_sched_build_groups(this_core_map, cpu_map, + init_sched_build_groups(d.this_core_map, cpu_map, &cpu_to_core_group, - send_covered, tmpmask); + d.send_covered, d.tmpmask); } #endif /* Set up physical groups */ for (i = 0; i < nr_node_ids; i++) { - cpumask_and(nodemask, cpumask_of_node(i), cpu_map); - if (cpumask_empty(nodemask)) + cpumask_and(d.nodemask, cpumask_of_node(i), cpu_map); + if (cpumask_empty(d.nodemask)) continue; - init_sched_build_groups(nodemask, cpu_map, + init_sched_build_groups(d.nodemask, cpu_map, &cpu_to_phys_group, - send_covered, tmpmask); + d.send_covered, d.tmpmask); } #ifdef CONFIG_NUMA /* Set up node groups */ - if (sd_allnodes) { + if (d.sd_allnodes) { init_sched_build_groups(cpu_map, cpu_map, &cpu_to_allnodes_group, - send_covered, tmpmask); + d.send_covered, d.tmpmask); } for (i = 0; i < nr_node_ids; i++) { @@ -8553,15 +8566,15 @@ static int __build_sched_domains(const struct cpumask *cpu_map, struct sched_group *sg, *prev; int j; - cpumask_clear(covered); - cpumask_and(nodemask, cpumask_of_node(i), cpu_map); - if (cpumask_empty(nodemask)) { - sched_group_nodes[i] = NULL; + cpumask_clear(d.covered); + cpumask_and(d.nodemask, cpumask_of_node(i), cpu_map); + if (cpumask_empty(d.nodemask)) { + d.sched_group_nodes[i] = NULL; continue; } - sched_domain_node_span(i, domainspan); - cpumask_and(domainspan, domainspan, cpu_map); + sched_domain_node_span(i, d.domainspan); + cpumask_and(d.domainspan, d.domainspan, cpu_map); sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(), GFP_KERNEL, i); @@ -8570,30 +8583,30 @@ static int __build_sched_domains(const struct cpumask *cpu_map, "node %d\n", i); goto error; } - sched_group_nodes[i] = sg; - for_each_cpu(j, nodemask) { + d.sched_group_nodes[i] = sg; + for_each_cpu(j, d.nodemask) { struct sched_domain *sd; sd = &per_cpu(node_domains, j).sd; sd->groups = sg; } sg->__cpu_power = 0; - cpumask_copy(sched_group_cpus(sg), nodemask); + cpumask_copy(sched_group_cpus(sg), d.nodemask); sg->next = sg; - cpumask_or(covered, covered, nodemask); + cpumask_or(d.covered, d.covered, d.nodemask); prev = sg; for (j = 0; j < nr_node_ids; j++) { int n = (i + j) % nr_node_ids; - cpumask_complement(notcovered, covered); - cpumask_and(tmpmask, notcovered, cpu_map); - cpumask_and(tmpmask, tmpmask, domainspan); - if (cpumask_empty(tmpmask)) + cpumask_complement(d.notcovered, d.covered); + cpumask_and(d.tmpmask, d.notcovered, cpu_map); + cpumask_and(d.tmpmask, d.tmpmask, d.domainspan); + if (cpumask_empty(d.tmpmask)) break; - cpumask_and(tmpmask, tmpmask, cpumask_of_node(n)); - if (cpumask_empty(tmpmask)) + cpumask_and(d.tmpmask, d.tmpmask, cpumask_of_node(n)); + if (cpumask_empty(d.tmpmask)) continue; sg = kmalloc_node(sizeof(struct sched_group) + @@ -8605,9 +8618,9 @@ static int __build_sched_domains(const struct cpumask *cpu_map, goto error; } sg->__cpu_power = 0; - cpumask_copy(sched_group_cpus(sg), tmpmask); + cpumask_copy(sched_group_cpus(sg), d.tmpmask); sg->next = prev->next; - cpumask_or(covered, covered, tmpmask); + cpumask_or(d.covered, d.covered, d.tmpmask); prev->next = sg; prev = sg; } @@ -8638,13 +8651,13 @@ static int __build_sched_domains(const struct cpumask *cpu_map, #ifdef CONFIG_NUMA for (i = 0; i < nr_node_ids; i++) - init_numa_sched_groups_power(sched_group_nodes[i]); + init_numa_sched_groups_power(d.sched_group_nodes[i]); - if (sd_allnodes) { + if (d.sd_allnodes) { struct sched_group *sg; cpu_to_allnodes_group(cpumask_first(cpu_map), cpu_map, &sg, - tmpmask); + d.tmpmask); init_numa_sched_groups_power(sg); } #endif @@ -8659,42 +8672,42 @@ static int __build_sched_domains(const struct cpumask *cpu_map, #else sd = &per_cpu(phys_domains, i).sd; #endif - cpu_attach_domain(sd, rd, i); + cpu_attach_domain(sd, d.rd, i); } err = 0; free_tmpmask: - free_cpumask_var(tmpmask); + free_cpumask_var(d.tmpmask); free_send_covered: - free_cpumask_var(send_covered); + free_cpumask_var(d.send_covered); free_this_core_map: - free_cpumask_var(this_core_map); + free_cpumask_var(d.this_core_map); free_this_sibling_map: - free_cpumask_var(this_sibling_map); + free_cpumask_var(d.this_sibling_map); free_nodemask: - free_cpumask_var(nodemask); + free_cpumask_var(d.nodemask); free_notcovered: #ifdef CONFIG_NUMA - free_cpumask_var(notcovered); + free_cpumask_var(d.notcovered); free_covered: - free_cpumask_var(covered); + free_cpumask_var(d.covered); free_domainspan: - free_cpumask_var(domainspan); + free_cpumask_var(d.domainspan); out: #endif return err; free_sched_groups: #ifdef CONFIG_NUMA - kfree(sched_group_nodes); + kfree(d.sched_group_nodes); #endif goto free_tmpmask; #ifdef CONFIG_NUMA error: - free_sched_groups(cpu_map, tmpmask); - free_rootdomain(rd); + free_sched_groups(cpu_map, d.tmpmask); + free_rootdomain(d.rd); goto free_tmpmask; #endif } From 2109b99ee192764b407dc7f52babb74740eea6f9 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Tue, 18 Aug 2009 12:53:00 +0200 Subject: [PATCH 26/64] sched: Separate out allocation/free/goto-hell from __build_sched_domains Signed-off-by: Andreas Herrmann Cc: Peter Zijlstra LKML-Reference: <20090818105300.GC29515@alberich.amd.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 171 ++++++++++++++++++++++++++++--------------------- 1 file changed, 99 insertions(+), 72 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index 565ff775fcd..c5d1fee4236 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -8107,6 +8107,23 @@ struct s_data { struct root_domain *rd; }; +enum s_alloc { + sa_sched_groups = 0, + sa_rootdomain, + sa_tmpmask, + sa_send_covered, + sa_this_core_map, + sa_this_sibling_map, + sa_nodemask, + sa_sched_group_nodes, +#ifdef CONFIG_NUMA + sa_notcovered, + sa_covered, + sa_domainspan, +#endif + sa_none, +}; + /* * SMT sched-domains: */ @@ -8394,6 +8411,77 @@ static void set_domain_attribute(struct sched_domain *sd, } } +static void __free_domain_allocs(struct s_data *d, enum s_alloc what, + const struct cpumask *cpu_map) +{ + switch (what) { + case sa_sched_groups: + free_sched_groups(cpu_map, d->tmpmask); /* fall through */ + d->sched_group_nodes = NULL; + case sa_rootdomain: + free_rootdomain(d->rd); /* fall through */ + case sa_tmpmask: + free_cpumask_var(d->tmpmask); /* fall through */ + case sa_send_covered: + free_cpumask_var(d->send_covered); /* fall through */ + case sa_this_core_map: + free_cpumask_var(d->this_core_map); /* fall through */ + case sa_this_sibling_map: + free_cpumask_var(d->this_sibling_map); /* fall through */ + case sa_nodemask: + free_cpumask_var(d->nodemask); /* fall through */ + case sa_sched_group_nodes: +#ifdef CONFIG_NUMA + kfree(d->sched_group_nodes); /* fall through */ + case sa_notcovered: + free_cpumask_var(d->notcovered); /* fall through */ + case sa_covered: + free_cpumask_var(d->covered); /* fall through */ + case sa_domainspan: + free_cpumask_var(d->domainspan); /* fall through */ +#endif + case sa_none: + break; + } +} + +static enum s_alloc __visit_domain_allocation_hell(struct s_data *d, + const struct cpumask *cpu_map) +{ +#ifdef CONFIG_NUMA + if (!alloc_cpumask_var(&d->domainspan, GFP_KERNEL)) + return sa_none; + if (!alloc_cpumask_var(&d->covered, GFP_KERNEL)) + return sa_domainspan; + if (!alloc_cpumask_var(&d->notcovered, GFP_KERNEL)) + return sa_covered; + /* Allocate the per-node list of sched groups */ + d->sched_group_nodes = kcalloc(nr_node_ids, + sizeof(struct sched_group *), GFP_KERNEL); + if (!d->sched_group_nodes) { + printk(KERN_WARNING "Can not alloc sched group node list\n"); + return sa_notcovered; + } + sched_group_nodes_bycpu[cpumask_first(cpu_map)] = d->sched_group_nodes; +#endif + if (!alloc_cpumask_var(&d->nodemask, GFP_KERNEL)) + return sa_sched_group_nodes; + if (!alloc_cpumask_var(&d->this_sibling_map, GFP_KERNEL)) + return sa_nodemask; + if (!alloc_cpumask_var(&d->this_core_map, GFP_KERNEL)) + return sa_this_sibling_map; + if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL)) + return sa_this_core_map; + if (!alloc_cpumask_var(&d->tmpmask, GFP_KERNEL)) + return sa_send_covered; + d->rd = alloc_rootdomain(); + if (!d->rd) { + printk(KERN_WARNING "Cannot alloc root domain\n"); + return sa_tmpmask; + } + return sa_rootdomain; +} + /* * Build sched domains for a given set of cpus and attach the sched domains * to the individual cpus @@ -8401,50 +8489,17 @@ static void set_domain_attribute(struct sched_domain *sd, static int __build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *attr) { + enum s_alloc alloc_state = sa_none; struct s_data d; - int i, err = -ENOMEM; + int i; #ifdef CONFIG_NUMA d.sd_allnodes = 0; - if (!alloc_cpumask_var(&d.domainspan, GFP_KERNEL)) - goto out; - if (!alloc_cpumask_var(&d.covered, GFP_KERNEL)) - goto free_domainspan; - if (!alloc_cpumask_var(&d.notcovered, GFP_KERNEL)) - goto free_covered; #endif - if (!alloc_cpumask_var(&d.nodemask, GFP_KERNEL)) - goto free_notcovered; - if (!alloc_cpumask_var(&d.this_sibling_map, GFP_KERNEL)) - goto free_nodemask; - if (!alloc_cpumask_var(&d.this_core_map, GFP_KERNEL)) - goto free_this_sibling_map; - if (!alloc_cpumask_var(&d.send_covered, GFP_KERNEL)) - goto free_this_core_map; - if (!alloc_cpumask_var(&d.tmpmask, GFP_KERNEL)) - goto free_send_covered; - -#ifdef CONFIG_NUMA - /* - * Allocate the per-node list of sched groups - */ - d.sched_group_nodes = kcalloc(nr_node_ids, sizeof(struct sched_group *), - GFP_KERNEL); - if (!d.sched_group_nodes) { - printk(KERN_WARNING "Can not alloc sched group node list\n"); - goto free_tmpmask; - } -#endif - - d.rd = alloc_rootdomain(); - if (!d.rd) { - printk(KERN_WARNING "Cannot alloc root domain\n"); - goto free_sched_groups; - } - -#ifdef CONFIG_NUMA - sched_group_nodes_bycpu[cpumask_first(cpu_map)] = d.sched_group_nodes; -#endif + alloc_state = __visit_domain_allocation_hell(&d, cpu_map); + if (alloc_state != sa_rootdomain) + goto error; + alloc_state = sa_sched_groups; /* * Set up domains for cpus specified by the cpu_map. @@ -8675,41 +8730,13 @@ static int __build_sched_domains(const struct cpumask *cpu_map, cpu_attach_domain(sd, d.rd, i); } - err = 0; + d.sched_group_nodes = NULL; /* don't free this we still need it */ + __free_domain_allocs(&d, sa_tmpmask, cpu_map); + return 0; -free_tmpmask: - free_cpumask_var(d.tmpmask); -free_send_covered: - free_cpumask_var(d.send_covered); -free_this_core_map: - free_cpumask_var(d.this_core_map); -free_this_sibling_map: - free_cpumask_var(d.this_sibling_map); -free_nodemask: - free_cpumask_var(d.nodemask); -free_notcovered: -#ifdef CONFIG_NUMA - free_cpumask_var(d.notcovered); -free_covered: - free_cpumask_var(d.covered); -free_domainspan: - free_cpumask_var(d.domainspan); -out: -#endif - return err; - -free_sched_groups: -#ifdef CONFIG_NUMA - kfree(d.sched_group_nodes); -#endif - goto free_tmpmask; - -#ifdef CONFIG_NUMA error: - free_sched_groups(cpu_map, d.tmpmask); - free_rootdomain(d.rd); - goto free_tmpmask; -#endif + __free_domain_allocs(&d, alloc_state, cpu_map); + return -ENOMEM; } static int build_sched_domains(const struct cpumask *cpu_map) From 7f4588f3aa395632fec9ba2e15a1920f0682fda0 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Tue, 18 Aug 2009 12:54:06 +0200 Subject: [PATCH 27/64] sched: Separate out build of NUMA sched domain from __build_sched_domains ... to further strip down __build_sched_domains(). Signed-off-by: Andreas Herrmann Cc: Peter Zijlstra LKML-Reference: <20090818105406.GD29515@alberich.amd.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 57 ++++++++++++++++++++++++++++---------------------- 1 file changed, 32 insertions(+), 25 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index c5d1fee4236..dd95a470837 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -8482,6 +8482,37 @@ static enum s_alloc __visit_domain_allocation_hell(struct s_data *d, return sa_rootdomain; } +static struct sched_domain *__build_numa_sched_domains(struct s_data *d, + const struct cpumask *cpu_map, struct sched_domain_attr *attr, int i) +{ + struct sched_domain *sd = NULL; +#ifdef CONFIG_NUMA + struct sched_domain *parent; + + d->sd_allnodes = 0; + if (cpumask_weight(cpu_map) > + SD_NODES_PER_DOMAIN * cpumask_weight(d->nodemask)) { + sd = &per_cpu(allnodes_domains, i).sd; + SD_INIT(sd, ALLNODES); + set_domain_attribute(sd, attr); + cpumask_copy(sched_domain_span(sd), cpu_map); + cpu_to_allnodes_group(i, cpu_map, &sd->groups, d->tmpmask); + d->sd_allnodes = 1; + } + parent = sd; + + sd = &per_cpu(node_domains, i).sd; + SD_INIT(sd, NODE); + set_domain_attribute(sd, attr); + sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd)); + sd->parent = parent; + if (parent) + parent->child = sd; + cpumask_and(sched_domain_span(sd), sched_domain_span(sd), cpu_map); +#endif + return sd; +} + /* * Build sched domains for a given set of cpus and attach the sched domains * to the individual cpus @@ -8510,31 +8541,7 @@ static int __build_sched_domains(const struct cpumask *cpu_map, cpumask_and(d.nodemask, cpumask_of_node(cpu_to_node(i)), cpu_map); -#ifdef CONFIG_NUMA - if (cpumask_weight(cpu_map) > - SD_NODES_PER_DOMAIN*cpumask_weight(d.nodemask)) { - sd = &per_cpu(allnodes_domains, i).sd; - SD_INIT(sd, ALLNODES); - set_domain_attribute(sd, attr); - cpumask_copy(sched_domain_span(sd), cpu_map); - cpu_to_allnodes_group(i, cpu_map, &sd->groups, - d.tmpmask); - p = sd; - d.sd_allnodes = 1; - } else - p = NULL; - - sd = &per_cpu(node_domains, i).sd; - SD_INIT(sd, NODE); - set_domain_attribute(sd, attr); - sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd)); - sd->parent = p; - if (p) - p->child = sd; - cpumask_and(sched_domain_span(sd), - sched_domain_span(sd), cpu_map); -#endif - + sd = __build_numa_sched_domains(&d, cpu_map, attr, i); p = sd; sd = &per_cpu(phys_domains, i).sd; SD_INIT(sd, CPU); From 87cce6622c2ab2f0e96ecc2a37133378a7db3177 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Tue, 18 Aug 2009 12:54:55 +0200 Subject: [PATCH 28/64] sched: Separate out build of CPU sched domain from __build_sched_domains ... to further strip down __build_sched_domains(). Signed-off-by: Andreas Herrmann Cc: Peter Zijlstra LKML-Reference: <20090818105455.GE29515@alberich.amd.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index dd95a470837..3d0666c5e02 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -8513,6 +8513,22 @@ static struct sched_domain *__build_numa_sched_domains(struct s_data *d, return sd; } +static struct sched_domain *__build_cpu_sched_domain(struct s_data *d, + const struct cpumask *cpu_map, struct sched_domain_attr *attr, + struct sched_domain *parent, int i) +{ + struct sched_domain *sd; + sd = &per_cpu(phys_domains, i).sd; + SD_INIT(sd, CPU); + set_domain_attribute(sd, attr); + cpumask_copy(sched_domain_span(sd), d->nodemask); + sd->parent = parent; + if (parent) + parent->child = sd; + cpu_to_phys_group(i, cpu_map, &sd->groups, d->tmpmask); + return sd; +} + /* * Build sched domains for a given set of cpus and attach the sched domains * to the individual cpus @@ -8542,15 +8558,7 @@ static int __build_sched_domains(const struct cpumask *cpu_map, cpu_map); sd = __build_numa_sched_domains(&d, cpu_map, attr, i); - p = sd; - sd = &per_cpu(phys_domains, i).sd; - SD_INIT(sd, CPU); - set_domain_attribute(sd, attr); - cpumask_copy(sched_domain_span(sd), d.nodemask); - sd->parent = p; - if (p) - p->child = sd; - cpu_to_phys_group(i, cpu_map, &sd->groups, d.tmpmask); + sd = __build_cpu_sched_domain(&d, cpu_map, attr, sd, i); #ifdef CONFIG_SCHED_MC p = sd; From 410c408108bb85f32fe132aaf448388af0b6da64 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Tue, 18 Aug 2009 12:56:14 +0200 Subject: [PATCH 29/64] sched: Separate out build of MC sched domain from __build_sched_domains ... to further strip down __build_sched_domains(). Signed-off-by: Andreas Herrmann Cc: Peter Zijlstra LKML-Reference: <20090818105614.GF29515@alberich.amd.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 30 ++++++++++++++++++------------ 1 file changed, 18 insertions(+), 12 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index 3d0666c5e02..5c829d4ba8f 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -8529,6 +8529,23 @@ static struct sched_domain *__build_cpu_sched_domain(struct s_data *d, return sd; } +static struct sched_domain *__build_mc_sched_domain(struct s_data *d, + const struct cpumask *cpu_map, struct sched_domain_attr *attr, + struct sched_domain *parent, int i) +{ + struct sched_domain *sd = parent; +#ifdef CONFIG_SCHED_MC + sd = &per_cpu(core_domains, i).sd; + SD_INIT(sd, MC); + set_domain_attribute(sd, attr); + cpumask_and(sched_domain_span(sd), cpu_map, cpu_coregroup_mask(i)); + sd->parent = parent; + parent->child = sd; + cpu_to_core_group(i, cpu_map, &sd->groups, d->tmpmask); +#endif + return sd; +} + /* * Build sched domains for a given set of cpus and attach the sched domains * to the individual cpus @@ -8559,18 +8576,7 @@ static int __build_sched_domains(const struct cpumask *cpu_map, sd = __build_numa_sched_domains(&d, cpu_map, attr, i); sd = __build_cpu_sched_domain(&d, cpu_map, attr, sd, i); - -#ifdef CONFIG_SCHED_MC - p = sd; - sd = &per_cpu(core_domains, i).sd; - SD_INIT(sd, MC); - set_domain_attribute(sd, attr); - cpumask_and(sched_domain_span(sd), cpu_map, - cpu_coregroup_mask(i)); - sd->parent = p; - p->child = sd; - cpu_to_core_group(i, cpu_map, &sd->groups, d.tmpmask); -#endif + sd = __build_mc_sched_domain(&d, cpu_map, attr, sd, i); #ifdef CONFIG_SCHED_SMT p = sd; From d81735355533cd4b2bce9508d86fcad24a38cf47 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Tue, 18 Aug 2009 12:57:03 +0200 Subject: [PATCH 30/64] sched: Separate out build of SMT sched domain from __build_sched_domains ... to further strip down __build_sched_domains(). Signed-off-by: Andreas Herrmann Cc: Peter Zijlstra LKML-Reference: <20090818105703.GG29515@alberich.amd.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 32 +++++++++++++++++++------------- 1 file changed, 19 insertions(+), 13 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index 5c829d4ba8f..2ecec06e3f0 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -8546,6 +8546,23 @@ static struct sched_domain *__build_mc_sched_domain(struct s_data *d, return sd; } +static struct sched_domain *__build_smt_sched_domain(struct s_data *d, + const struct cpumask *cpu_map, struct sched_domain_attr *attr, + struct sched_domain *parent, int i) +{ + struct sched_domain *sd = parent; +#ifdef CONFIG_SCHED_SMT + sd = &per_cpu(cpu_domains, i).sd; + SD_INIT(sd, SIBLING); + set_domain_attribute(sd, attr); + cpumask_and(sched_domain_span(sd), cpu_map, topology_thread_cpumask(i)); + sd->parent = parent; + parent->child = sd; + cpu_to_cpu_group(i, cpu_map, &sd->groups, d->tmpmask); +#endif + return sd; +} + /* * Build sched domains for a given set of cpus and attach the sched domains * to the individual cpus @@ -8569,7 +8586,7 @@ static int __build_sched_domains(const struct cpumask *cpu_map, * Set up domains for cpus specified by the cpu_map. */ for_each_cpu(i, cpu_map) { - struct sched_domain *sd = NULL, *p; + struct sched_domain *sd; cpumask_and(d.nodemask, cpumask_of_node(cpu_to_node(i)), cpu_map); @@ -8577,18 +8594,7 @@ static int __build_sched_domains(const struct cpumask *cpu_map, sd = __build_numa_sched_domains(&d, cpu_map, attr, i); sd = __build_cpu_sched_domain(&d, cpu_map, attr, sd, i); sd = __build_mc_sched_domain(&d, cpu_map, attr, sd, i); - -#ifdef CONFIG_SCHED_SMT - p = sd; - sd = &per_cpu(cpu_domains, i).sd; - SD_INIT(sd, SIBLING); - set_domain_attribute(sd, attr); - cpumask_and(sched_domain_span(sd), - topology_thread_cpumask(i), cpu_map); - sd->parent = p; - p->child = sd; - cpu_to_cpu_group(i, cpu_map, &sd->groups, d.tmpmask); -#endif + sd = __build_smt_sched_domain(&d, cpu_map, attr, sd, i); } #ifdef CONFIG_SCHED_SMT From 0e8e85c941d8f1b43bcc2e3b8b7026cdae476c53 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Tue, 18 Aug 2009 12:57:51 +0200 Subject: [PATCH 31/64] sched: Separate out build of SMT sched groups from __build_sched_domains ... to further strip down __build_sched_domains(). Signed-off-by: Andreas Herrmann Cc: Peter Zijlstra LKML-Reference: <20090818105751.GH29515@alberich.amd.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 31 ++++++++++++++++++++----------- 1 file changed, 20 insertions(+), 11 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index 2ecec06e3f0..43cfc6e54e9 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -8563,6 +8563,25 @@ static struct sched_domain *__build_smt_sched_domain(struct s_data *d, return sd; } +static void build_sched_groups(struct s_data *d, enum sched_domain_level l, + const struct cpumask *cpu_map, int cpu) +{ + switch (l) { +#ifdef CONFIG_SCHED_SMT + case SD_LV_SIBLING: /* set up CPU (sibling) groups */ + cpumask_and(d->this_sibling_map, cpu_map, + topology_thread_cpumask(cpu)); + if (cpu == cpumask_first(d->this_sibling_map)) + init_sched_build_groups(d->this_sibling_map, cpu_map, + &cpu_to_cpu_group, + d->send_covered, d->tmpmask); + break; +#endif + default: + break; + } +} + /* * Build sched domains for a given set of cpus and attach the sched domains * to the individual cpus @@ -8597,19 +8616,9 @@ static int __build_sched_domains(const struct cpumask *cpu_map, sd = __build_smt_sched_domain(&d, cpu_map, attr, sd, i); } -#ifdef CONFIG_SCHED_SMT - /* Set up CPU (sibling) groups */ for_each_cpu(i, cpu_map) { - cpumask_and(d.this_sibling_map, - topology_thread_cpumask(i), cpu_map); - if (i != cpumask_first(d.this_sibling_map)) - continue; - - init_sched_build_groups(d.this_sibling_map, cpu_map, - &cpu_to_cpu_group, - d.send_covered, d.tmpmask); + build_sched_groups(&d, SD_LV_SIBLING, cpu_map, i); } -#endif #ifdef CONFIG_SCHED_MC /* Set up multi-core groups */ From a2af04cdbb748158043e31799b28c48272081600 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Tue, 18 Aug 2009 12:58:38 +0200 Subject: [PATCH 32/64] sched: Separate out build of MC sched groups from __build_sched_domains ... to further strip down __build_sched_domains(). Signed-off-by: Andreas Herrmann Cc: Peter Zijlstra LKML-Reference: <20090818105838.GI29515@alberich.amd.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index 43cfc6e54e9..f2c202f6629 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -8576,6 +8576,15 @@ static void build_sched_groups(struct s_data *d, enum sched_domain_level l, &cpu_to_cpu_group, d->send_covered, d->tmpmask); break; +#endif +#ifdef CONFIG_SCHED_MC + case SD_LV_MC: /* set up multi-core groups */ + cpumask_and(d->this_core_map, cpu_map, cpu_coregroup_mask(cpu)); + if (cpu == cpumask_first(d->this_core_map)) + init_sched_build_groups(d->this_core_map, cpu_map, + &cpu_to_core_group, + d->send_covered, d->tmpmask); + break; #endif default: break; @@ -8618,21 +8627,9 @@ static int __build_sched_domains(const struct cpumask *cpu_map, for_each_cpu(i, cpu_map) { build_sched_groups(&d, SD_LV_SIBLING, cpu_map, i); + build_sched_groups(&d, SD_LV_MC, cpu_map, i); } -#ifdef CONFIG_SCHED_MC - /* Set up multi-core groups */ - for_each_cpu(i, cpu_map) { - cpumask_and(d.this_core_map, cpu_coregroup_mask(i), cpu_map); - if (i != cpumask_first(d.this_core_map)) - continue; - - init_sched_build_groups(d.this_core_map, cpu_map, - &cpu_to_core_group, - d.send_covered, d.tmpmask); - } -#endif - /* Set up physical groups */ for (i = 0; i < nr_node_ids; i++) { cpumask_and(d.nodemask, cpumask_of_node(i), cpu_map); From 86548096f252bfe2065f1ea2d301e7319a16375d Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Tue, 18 Aug 2009 12:59:28 +0200 Subject: [PATCH 33/64] sched: Separate out build of CPU sched groups from __build_sched_domains ... to further strip down __build_sched_domains(). Signed-off-by: Andreas Herrmann Cc: Peter Zijlstra LKML-Reference: <20090818105928.GJ29515@alberich.amd.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index f2c202f6629..b09a41c93ae 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -8586,6 +8586,13 @@ static void build_sched_groups(struct s_data *d, enum sched_domain_level l, d->send_covered, d->tmpmask); break; #endif + case SD_LV_CPU: /* set up physical groups */ + cpumask_and(d->nodemask, cpumask_of_node(cpu), cpu_map); + if (!cpumask_empty(d->nodemask)) + init_sched_build_groups(d->nodemask, cpu_map, + &cpu_to_phys_group, + d->send_covered, d->tmpmask); + break; default: break; } @@ -8631,15 +8638,8 @@ static int __build_sched_domains(const struct cpumask *cpu_map, } /* Set up physical groups */ - for (i = 0; i < nr_node_ids; i++) { - cpumask_and(d.nodemask, cpumask_of_node(i), cpu_map); - if (cpumask_empty(d.nodemask)) - continue; - - init_sched_build_groups(d.nodemask, cpu_map, - &cpu_to_phys_group, - d.send_covered, d.tmpmask); - } + for (i = 0; i < nr_node_ids; i++) + build_sched_groups(&d, SD_LV_CPU, cpu_map, i); #ifdef CONFIG_NUMA /* Set up node groups */ From de616e36c700dc312d9021dd75f769c463f85122 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Tue, 18 Aug 2009 13:00:13 +0200 Subject: [PATCH 34/64] sched: Separate out build of ALLNODES sched groups from __build_sched_domains For the sake of completeness. Now all calls to init_sched_build_groups() are contained in build_sched_groups(). Signed-off-by: Andreas Herrmann Cc: Peter Zijlstra LKML-Reference: <20090818110013.GK29515@alberich.amd.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index b09a41c93ae..52c1953bc41 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -8593,6 +8593,12 @@ static void build_sched_groups(struct s_data *d, enum sched_domain_level l, &cpu_to_phys_group, d->send_covered, d->tmpmask); break; +#ifdef CONFIG_NUMA + case SD_LV_ALLNODES: + init_sched_build_groups(cpu_map, cpu_map, &cpu_to_allnodes_group, + d->send_covered, d->tmpmask); + break; +#endif default: break; } @@ -8643,11 +8649,8 @@ static int __build_sched_domains(const struct cpumask *cpu_map, #ifdef CONFIG_NUMA /* Set up node groups */ - if (d.sd_allnodes) { - init_sched_build_groups(cpu_map, cpu_map, - &cpu_to_allnodes_group, - d.send_covered, d.tmpmask); - } + if (d.sd_allnodes) + build_sched_groups(&d, SD_LV_ALLNODES, cpu_map, 0); for (i = 0; i < nr_node_ids; i++) { /* Set up node groups */ From 0601a88d8fa4508eaa49a6d96c6685e1dece38e3 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Tue, 18 Aug 2009 13:01:11 +0200 Subject: [PATCH 35/64] sched: Separate out build of NUMA sched groups from __build_sched_domains ... to further strip down __build_sched_domains(). Signed-off-by: Andreas Herrmann Cc: Peter Zijlstra LKML-Reference: <20090818110111.GL29515@alberich.amd.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 130 +++++++++++++++++++++++++------------------------ 1 file changed, 67 insertions(+), 63 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index 52c1953bc41..c1ce884a016 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -8246,6 +8246,71 @@ static void init_numa_sched_groups_power(struct sched_group *group_head) sg = sg->next; } while (sg != group_head); } + +static int build_numa_sched_groups(struct s_data *d, + const struct cpumask *cpu_map, int num) +{ + struct sched_domain *sd; + struct sched_group *sg, *prev; + int n, j; + + cpumask_clear(d->covered); + cpumask_and(d->nodemask, cpumask_of_node(num), cpu_map); + if (cpumask_empty(d->nodemask)) { + d->sched_group_nodes[num] = NULL; + goto out; + } + + sched_domain_node_span(num, d->domainspan); + cpumask_and(d->domainspan, d->domainspan, cpu_map); + + sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(), + GFP_KERNEL, num); + if (!sg) { + printk(KERN_WARNING "Can not alloc domain group for node %d\n", + num); + return -ENOMEM; + } + d->sched_group_nodes[num] = sg; + + for_each_cpu(j, d->nodemask) { + sd = &per_cpu(node_domains, j).sd; + sd->groups = sg; + } + + sg->__cpu_power = 0; + cpumask_copy(sched_group_cpus(sg), d->nodemask); + sg->next = sg; + cpumask_or(d->covered, d->covered, d->nodemask); + + prev = sg; + for (j = 0; j < nr_node_ids; j++) { + n = (num + j) % nr_node_ids; + cpumask_complement(d->notcovered, d->covered); + cpumask_and(d->tmpmask, d->notcovered, cpu_map); + cpumask_and(d->tmpmask, d->tmpmask, d->domainspan); + if (cpumask_empty(d->tmpmask)) + break; + cpumask_and(d->tmpmask, d->tmpmask, cpumask_of_node(n)); + if (cpumask_empty(d->tmpmask)) + continue; + sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(), + GFP_KERNEL, num); + if (!sg) { + printk(KERN_WARNING + "Can not alloc domain group for node %d\n", j); + return -ENOMEM; + } + sg->__cpu_power = 0; + cpumask_copy(sched_group_cpus(sg), d->tmpmask); + sg->next = prev->next; + cpumask_or(d->covered, d->covered, d->tmpmask); + prev->next = sg; + prev = sg; + } +out: + return 0; +} #endif /* CONFIG_NUMA */ #ifdef CONFIG_NUMA @@ -8652,70 +8717,9 @@ static int __build_sched_domains(const struct cpumask *cpu_map, if (d.sd_allnodes) build_sched_groups(&d, SD_LV_ALLNODES, cpu_map, 0); - for (i = 0; i < nr_node_ids; i++) { - /* Set up node groups */ - struct sched_group *sg, *prev; - int j; - - cpumask_clear(d.covered); - cpumask_and(d.nodemask, cpumask_of_node(i), cpu_map); - if (cpumask_empty(d.nodemask)) { - d.sched_group_nodes[i] = NULL; - continue; - } - - sched_domain_node_span(i, d.domainspan); - cpumask_and(d.domainspan, d.domainspan, cpu_map); - - sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(), - GFP_KERNEL, i); - if (!sg) { - printk(KERN_WARNING "Can not alloc domain group for " - "node %d\n", i); + for (i = 0; i < nr_node_ids; i++) + if (build_numa_sched_groups(&d, cpu_map, i)) goto error; - } - d.sched_group_nodes[i] = sg; - for_each_cpu(j, d.nodemask) { - struct sched_domain *sd; - - sd = &per_cpu(node_domains, j).sd; - sd->groups = sg; - } - sg->__cpu_power = 0; - cpumask_copy(sched_group_cpus(sg), d.nodemask); - sg->next = sg; - cpumask_or(d.covered, d.covered, d.nodemask); - prev = sg; - - for (j = 0; j < nr_node_ids; j++) { - int n = (i + j) % nr_node_ids; - - cpumask_complement(d.notcovered, d.covered); - cpumask_and(d.tmpmask, d.notcovered, cpu_map); - cpumask_and(d.tmpmask, d.tmpmask, d.domainspan); - if (cpumask_empty(d.tmpmask)) - break; - - cpumask_and(d.tmpmask, d.tmpmask, cpumask_of_node(n)); - if (cpumask_empty(d.tmpmask)) - continue; - - sg = kmalloc_node(sizeof(struct sched_group) + - cpumask_size(), - GFP_KERNEL, i); - if (!sg) { - printk(KERN_WARNING - "Can not alloc domain group for node %d\n", j); - goto error; - } - sg->__cpu_power = 0; - cpumask_copy(sched_group_cpus(sg), d.tmpmask); - sg->next = prev->next; - cpumask_or(d.covered, d.covered, d.tmpmask); - prev->next = sg; - prev = sg; - } - } #endif /* Calculate CPU power for physical packages and nodes */ From 294b0c9619a0469a3b385b6fc47e79f64222a692 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Tue, 18 Aug 2009 13:02:29 +0200 Subject: [PATCH 36/64] sched: Consolidate definition of variable sd in __build_sched_domains Signed-off-by: Andreas Herrmann Cc: Peter Zijlstra LKML-Reference: <20090818110229.GM29515@alberich.amd.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index c1ce884a016..cf4c953d648 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -8678,6 +8678,7 @@ static int __build_sched_domains(const struct cpumask *cpu_map, { enum s_alloc alloc_state = sa_none; struct s_data d; + struct sched_domain *sd; int i; #ifdef CONFIG_NUMA d.sd_allnodes = 0; @@ -8692,8 +8693,6 @@ static int __build_sched_domains(const struct cpumask *cpu_map, * Set up domains for cpus specified by the cpu_map. */ for_each_cpu(i, cpu_map) { - struct sched_domain *sd; - cpumask_and(d.nodemask, cpumask_of_node(cpu_to_node(i)), cpu_map); @@ -8725,22 +8724,19 @@ static int __build_sched_domains(const struct cpumask *cpu_map, /* Calculate CPU power for physical packages and nodes */ #ifdef CONFIG_SCHED_SMT for_each_cpu(i, cpu_map) { - struct sched_domain *sd = &per_cpu(cpu_domains, i).sd; - + sd = &per_cpu(cpu_domains, i).sd; init_sched_groups_power(i, sd); } #endif #ifdef CONFIG_SCHED_MC for_each_cpu(i, cpu_map) { - struct sched_domain *sd = &per_cpu(core_domains, i).sd; - + sd = &per_cpu(core_domains, i).sd; init_sched_groups_power(i, sd); } #endif for_each_cpu(i, cpu_map) { - struct sched_domain *sd = &per_cpu(phys_domains, i).sd; - + sd = &per_cpu(phys_domains, i).sd; init_sched_groups_power(i, sd); } @@ -8759,7 +8755,6 @@ static int __build_sched_domains(const struct cpumask *cpu_map, /* Attach the domains */ for_each_cpu(i, cpu_map) { - struct sched_domain *sd; #ifdef CONFIG_SCHED_SMT sd = &per_cpu(cpu_domains, i).sd; #elif defined(CONFIG_SCHED_MC) From cde7e5ca4e329a157108769d1f752d191cbb71c6 Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Tue, 18 Aug 2009 13:01:01 +0900 Subject: [PATCH 37/64] sched: Use for_each_class macro in move_one_task() Replace for loop with the macro for_each_class to cleanup. Signed-off-by: Hiroshi Shimamoto LKML-Reference: <4A8A277D.4090304@ct.jp.nec.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/sched.c b/kernel/sched.c index 7f83be35d65..1b529efe887 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3461,9 +3461,10 @@ static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, { const struct sched_class *class; - for (class = sched_class_highest; class; class = class->next) + for_each_class(class) { if (class->move_one_task(this_rq, this_cpu, busiest, sd, idle)) return 1; + } return 0; } From a8af7246c114bfd939e539f9566b872c06f6225c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 21 Aug 2009 13:58:54 +0200 Subject: [PATCH 38/64] sched: Avoid division by zero Patch a5004278f0525dcb9aa43703ef77bf371ea837cd (sched: Fix cgroup smp fairness) introduced the possibility of a divide-by-zero because load-balancing is not synchronized between sched_domains. This can cause the state of cpus to change between the first and second loop over the sched domain in tg_shares_up(). Reported-by: Yinghai Lu Signed-off-by: Peter Zijlstra Cc: Jes Sorensen Cc: Jens Axboe Cc: Linus Torvalds LKML-Reference: <1250855934.7538.30.camel@twins> Signed-off-by: Ingo Molnar --- kernel/sched.c | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index 1b529efe887..8f8a98eab9d 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1522,7 +1522,8 @@ static void __set_se_shares(struct sched_entity *se, unsigned long shares); */ static void update_group_shares_cpu(struct task_group *tg, int cpu, - unsigned long sd_shares, unsigned long sd_rq_weight) + unsigned long sd_shares, unsigned long sd_rq_weight, + unsigned long sd_eff_weight) { unsigned long rq_weight; unsigned long shares; @@ -1535,13 +1536,15 @@ update_group_shares_cpu(struct task_group *tg, int cpu, if (!rq_weight) { boost = 1; rq_weight = NICE_0_LOAD; + if (sd_rq_weight == sd_eff_weight) + sd_eff_weight += NICE_0_LOAD; + sd_rq_weight = sd_eff_weight; } /* - * \Sum shares * rq_weight - * shares = ----------------------- - * \Sum rq_weight - * + * \Sum_j shares_j * rq_weight_i + * shares_i = ----------------------------- + * \Sum_j rq_weight_j */ shares = (sd_shares * rq_weight) / sd_rq_weight; shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES); @@ -1593,14 +1596,8 @@ static int tg_shares_up(struct task_group *tg, void *data) if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE)) shares = tg->shares; - for_each_cpu(i, sched_domain_span(sd)) { - unsigned long sd_rq_weight = rq_weight; - - if (!tg->cfs_rq[i]->rq_weight) - sd_rq_weight = eff_weight; - - update_group_shares_cpu(tg, i, shares, sd_rq_weight); - } + for_each_cpu(i, sched_domain_span(sd)) + update_group_shares_cpu(tg, i, shares, rq_weight, eff_weight); return 0; } From fa84e9eecfff478df2d00e94deb3fc40fe4634ad Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 21 Aug 2009 22:01:12 +0200 Subject: [PATCH 39/64] init: Move sched_clock_init after late_time_init Some architectures initialize clocks and timers in late_time_init and x86 wants to do the same to avoid FIXMAP hackery for calibrating the TSC. That would result in undefined sched_clock readout and wreckaged printk timestamps again. We probably have those already on archs which do all their time/clock setup in late_time_init. There is no harm to move that after late_time_init except that a few more boot timestamps are stale. The scheduler is not active at that point so no real wreckage is expected. Signed-off-by: Thomas Gleixner LKML-Reference: Cc: linux-arch@vger.kernel.org --- init/main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/init/main.c b/init/main.c index 11f4f145be3..0ec75ce771a 100644 --- a/init/main.c +++ b/init/main.c @@ -631,7 +631,6 @@ asmlinkage void __init start_kernel(void) softirq_init(); timekeeping_init(); time_init(); - sched_clock_init(); profile_init(); if (!irqs_disabled()) printk(KERN_CRIT "start_kernel(): bug: interrupts were " @@ -682,6 +681,7 @@ asmlinkage void __init start_kernel(void) numa_policy_init(); if (late_time_init) late_time_init(); + sched_clock_init(); calibrate_delay(); pidmap_init(); anon_vma_init(); From 34d76c41554a05425613d16efebb3069c4c545f0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 27 Aug 2009 13:08:56 +0200 Subject: [PATCH 40/64] sched: Fix division by zero - really When re-computing the shares for each task group's cpu representation we need the ratio of weight on each cpu vs the total weight of the sched domain. Since load-balancing is loosely (read not) synchronized, the weight of individual cpus can change between doing the sum and calculating the ratio. The previous patch dealt with only one of the race scenarios, this patch side steps them all by saving a snapshot of all the individual cpu weights, thereby always working on a consistent set. Signed-off-by: Peter Zijlstra Cc: torvalds@linux-foundation.org Cc: jes@sgi.com Cc: jens.axboe@oracle.com Cc: Balbir Singh Cc: Arjan van de Ven Cc: Yinghai Lu LKML-Reference: <1251371336.18584.77.camel@twins> Signed-off-by: Ingo Molnar --- kernel/sched.c | 50 +++++++++++++++++++++++++++++--------------------- 1 file changed, 29 insertions(+), 21 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index 8f8a98eab9d..523e20a6269 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1515,30 +1515,29 @@ static unsigned long cpu_avg_load_per_task(int cpu) #ifdef CONFIG_FAIR_GROUP_SCHED +struct update_shares_data { + unsigned long rq_weight[NR_CPUS]; +}; + +static DEFINE_PER_CPU(struct update_shares_data, update_shares_data); + static void __set_se_shares(struct sched_entity *se, unsigned long shares); /* * Calculate and set the cpu's group shares. */ -static void -update_group_shares_cpu(struct task_group *tg, int cpu, - unsigned long sd_shares, unsigned long sd_rq_weight, - unsigned long sd_eff_weight) +static void update_group_shares_cpu(struct task_group *tg, int cpu, + unsigned long sd_shares, + unsigned long sd_rq_weight, + struct update_shares_data *usd) { - unsigned long rq_weight; - unsigned long shares; + unsigned long shares, rq_weight; int boost = 0; - if (!tg->se[cpu]) - return; - - rq_weight = tg->cfs_rq[cpu]->rq_weight; + rq_weight = usd->rq_weight[cpu]; if (!rq_weight) { boost = 1; rq_weight = NICE_0_LOAD; - if (sd_rq_weight == sd_eff_weight) - sd_eff_weight += NICE_0_LOAD; - sd_rq_weight = sd_eff_weight; } /* @@ -1555,6 +1554,7 @@ update_group_shares_cpu(struct task_group *tg, int cpu, unsigned long flags; spin_lock_irqsave(&rq->lock, flags); + tg->cfs_rq[cpu]->rq_weight = boost ? 0 : rq_weight; tg->cfs_rq[cpu]->shares = boost ? 0 : shares; __set_se_shares(tg->se[cpu], shares); spin_unlock_irqrestore(&rq->lock, flags); @@ -1568,25 +1568,31 @@ update_group_shares_cpu(struct task_group *tg, int cpu, */ static int tg_shares_up(struct task_group *tg, void *data) { - unsigned long weight, rq_weight = 0, eff_weight = 0; - unsigned long shares = 0; + unsigned long weight, rq_weight = 0, shares = 0; + struct update_shares_data *usd; struct sched_domain *sd = data; + unsigned long flags; int i; + if (!tg->se[0]) + return 0; + + local_irq_save(flags); + usd = &__get_cpu_var(update_shares_data); + for_each_cpu(i, sched_domain_span(sd)) { + weight = tg->cfs_rq[i]->load.weight; + usd->rq_weight[i] = weight; + /* * If there are currently no tasks on the cpu pretend there * is one of average load so that when a new task gets to * run here it will not get delayed by group starvation. */ - weight = tg->cfs_rq[i]->load.weight; - tg->cfs_rq[i]->rq_weight = weight; - rq_weight += weight; - if (!weight) weight = NICE_0_LOAD; - eff_weight += weight; + rq_weight += weight; shares += tg->cfs_rq[i]->shares; } @@ -1597,7 +1603,9 @@ static int tg_shares_up(struct task_group *tg, void *data) shares = tg->shares; for_each_cpu(i, sched_domain_span(sd)) - update_group_shares_cpu(tg, i, shares, rq_weight, eff_weight); + update_group_shares_cpu(tg, i, shares, rq_weight, usd); + + local_irq_restore(flags); return 0; } From 84e9dabf6e6a78928c6a1a8ba235c9fb0908d0f8 Mon Sep 17 00:00:00 2001 From: Anirban Sinha Date: Fri, 28 Aug 2009 22:40:43 -0700 Subject: [PATCH 41/64] sched: Rename init_cfs_rq => init_tg_cfs_rq ... so that it does not share a common name with a function within the same scope. Signed-off-by: Anirban Sinha LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/sched.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index 523e20a6269..6244d24cafc 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -309,8 +309,8 @@ void set_tg_uid(struct user_struct *user) /* * Root task group. - * Every UID task group (including init_task_group aka UID-0) will - * be a child to this group. + * Every UID task group (including init_task_group aka UID-0) will + * be a child to this group. */ struct task_group root_task_group; @@ -318,7 +318,7 @@ struct task_group root_task_group; /* Default task group's sched entity on each cpu */ static DEFINE_PER_CPU(struct sched_entity, init_sched_entity); /* Default task group's cfs_rq on each cpu */ -static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp; +static DEFINE_PER_CPU(struct cfs_rq, init_tg_cfs_rq) ____cacheline_aligned_in_smp; #endif /* CONFIG_FAIR_GROUP_SCHED */ #ifdef CONFIG_RT_GROUP_SCHED @@ -9400,11 +9400,11 @@ void __init sched_init(void) * system cpu resource, based on the weight assigned to root * user's cpu share (INIT_TASK_GROUP_LOAD). This is accomplished * by letting tasks of init_task_group sit in a separate cfs_rq - * (init_cfs_rq) and having one entity represent this group of + * (init_tg_cfs_rq) and having one entity represent this group of * tasks in rq->cfs (i.e init_task_group->se[] != NULL). */ init_tg_cfs_entry(&init_task_group, - &per_cpu(init_cfs_rq, i), + &per_cpu(init_tg_cfs_rq, i), &per_cpu(init_sched_entity, i), i, 1, root_task_group.se[i]); From 8f0dfc34e9b323a028c2ec41abb7e9de477b7a94 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Mon, 20 Jul 2009 11:26:58 -0700 Subject: [PATCH 42/64] sched: Provide iowait counters For counting how long an application has been waiting for (disk) IO, there currently is only the HZ sample driven information available, while for all other counters in this class, a high resolution version is available via CONFIG_SCHEDSTATS. In order to make an improved bootchart tool possible, we also need a higher resolution version of the iowait time. This patch below adds this scheduler statistic to the kernel. Signed-off-by: Arjan van de Ven Signed-off-by: Peter Zijlstra LKML-Reference: <4A64B813.1080506@linux.intel.com> Signed-off-by: Ingo Molnar --- include/linux/sched.h | 4 ++++ kernel/sched.c | 4 ++++ kernel/sched_debug.c | 4 ++++ kernel/sched_fair.c | 5 +++++ 4 files changed, 17 insertions(+) diff --git a/include/linux/sched.h b/include/linux/sched.h index e209ae0e7a8..9c96ef2f7e6 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1111,6 +1111,8 @@ struct sched_entity { u64 wait_max; u64 wait_count; u64 wait_sum; + u64 iowait_count; + u64 iowait_sum; u64 sleep_start; u64 sleep_max; @@ -1231,6 +1233,8 @@ struct task_struct { unsigned did_exec:1; unsigned in_execve:1; /* Tell the LSMs that the process is doing an * execve */ + unsigned in_iowait:1; + /* Revert to default priority/policy when forking */ unsigned sched_reset_on_fork:1; diff --git a/kernel/sched.c b/kernel/sched.c index 6244d24cafc..38d05a89e0f 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6754,7 +6754,9 @@ void __sched io_schedule(void) delayacct_blkio_start(); atomic_inc(&rq->nr_iowait); + current->in_iowait = 1; schedule(); + current->in_iowait = 0; atomic_dec(&rq->nr_iowait); delayacct_blkio_end(); } @@ -6767,7 +6769,9 @@ long __sched io_schedule_timeout(long timeout) delayacct_blkio_start(); atomic_inc(&rq->nr_iowait); + current->in_iowait = 1; ret = schedule_timeout(timeout); + current->in_iowait = 0; atomic_dec(&rq->nr_iowait); delayacct_blkio_end(); return ret; diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 70c7e0b7994..5ddbd089126 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -409,6 +409,8 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) PN(se.wait_max); PN(se.wait_sum); P(se.wait_count); + PN(se.iowait_sum); + P(se.iowait_count); P(sched_info.bkl_count); P(se.nr_migrations); P(se.nr_migrations_cold); @@ -479,6 +481,8 @@ void proc_sched_set_task(struct task_struct *p) p->se.wait_max = 0; p->se.wait_sum = 0; p->se.wait_count = 0; + p->se.iowait_sum = 0; + p->se.iowait_count = 0; p->se.sleep_max = 0; p->se.sum_sleep_runtime = 0; p->se.block_max = 0; diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 342000b31ad..471fa281f5e 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -652,6 +652,11 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) se->sum_sleep_runtime += delta; if (tsk) { + if (tsk->in_iowait) { + se->iowait_sum += delta; + se->iowait_count++; + } + /* * Blocking time is in units of nanosecs, so shift by * 20 to get a milliseconds-range estimation of the From 768d0c27226e6587cad2fcf543f9711da3f3774e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 23 Jul 2009 20:13:26 +0200 Subject: [PATCH 43/64] sched: Add wait, sleep and iowait accounting tracepoints Add 3 schedstat tracepoints to help account for wait-time, sleep-time and iowait-time. They can also be used as a perf-counter source to profile tasks on these clocks. Signed-off-by: Peter Zijlstra Cc: Steven Rostedt Cc: Frederic Weisbecker Cc: Arjan van de Ven LKML-Reference: [ build fix for the !CONFIG_SCHEDSTATS case ] Signed-off-by: Ingo Molnar --- include/trace/events/sched.h | 95 ++++++++++++++++++++++++++++++++++++ kernel/sched_fair.c | 12 ++++- 2 files changed, 106 insertions(+), 1 deletion(-) diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index 8949bb7eb08..a4c369ec328 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -340,6 +340,101 @@ TRACE_EVENT(sched_signal_send, __entry->sig, __entry->comm, __entry->pid) ); +/* + * XXX the below sched_stat tracepoints only apply to SCHED_OTHER/BATCH/IDLE + * adding sched_stat support to SCHED_FIFO/RR would be welcome. + */ + +/* + * Tracepoint for accounting wait time (time the task is runnable + * but not actually running due to scheduler contention). + */ +TRACE_EVENT(sched_stat_wait, + + TP_PROTO(struct task_struct *tsk, u64 delay), + + TP_ARGS(tsk, delay), + + TP_STRUCT__entry( + __array( char, comm, TASK_COMM_LEN ) + __field( pid_t, pid ) + __field( u64, delay ) + ), + + TP_fast_assign( + memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN); + __entry->pid = tsk->pid; + __entry->delay = delay; + ) + TP_perf_assign( + __perf_count(delay); + ), + + TP_printk("task: %s:%d wait: %Lu [ns]", + __entry->comm, __entry->pid, + (unsigned long long)__entry->delay) +); + +/* + * Tracepoint for accounting sleep time (time the task is not runnable, + * including iowait, see below). + */ +TRACE_EVENT(sched_stat_sleep, + + TP_PROTO(struct task_struct *tsk, u64 delay), + + TP_ARGS(tsk, delay), + + TP_STRUCT__entry( + __array( char, comm, TASK_COMM_LEN ) + __field( pid_t, pid ) + __field( u64, delay ) + ), + + TP_fast_assign( + memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN); + __entry->pid = tsk->pid; + __entry->delay = delay; + ) + TP_perf_assign( + __perf_count(delay); + ), + + TP_printk("task: %s:%d sleep: %Lu [ns]", + __entry->comm, __entry->pid, + (unsigned long long)__entry->delay) +); + +/* + * Tracepoint for accounting iowait time (time the task is not runnable + * due to waiting on IO to complete). + */ +TRACE_EVENT(sched_stat_iowait, + + TP_PROTO(struct task_struct *tsk, u64 delay), + + TP_ARGS(tsk, delay), + + TP_STRUCT__entry( + __array( char, comm, TASK_COMM_LEN ) + __field( pid_t, pid ) + __field( u64, delay ) + ), + + TP_fast_assign( + memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN); + __entry->pid = tsk->pid; + __entry->delay = delay; + ) + TP_perf_assign( + __perf_count(delay); + ), + + TP_printk("task: %s:%d iowait: %Lu [ns]", + __entry->comm, __entry->pid, + (unsigned long long)__entry->delay) +); + #endif /* _TRACE_SCHED_H */ /* This part must be outside protection */ diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 471fa281f5e..2ff850f90d1 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -546,6 +546,13 @@ update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) schedstat_set(se->wait_sum, se->wait_sum + rq_of(cfs_rq)->clock - se->wait_start); schedstat_set(se->wait_start, 0); + +#ifdef CONFIG_SCHEDSTATS + if (entity_is_task(se)) { + trace_sched_stat_wait(task_of(se), + rq_of(cfs_rq)->clock - se->wait_start); + } +#endif } static inline void @@ -636,8 +643,10 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) se->sleep_start = 0; se->sum_sleep_runtime += delta; - if (tsk) + if (tsk) { account_scheduler_latency(tsk, delta >> 10, 1); + trace_sched_stat_sleep(tsk, delta); + } } if (se->block_start) { u64 delta = rq_of(cfs_rq)->clock - se->block_start; @@ -655,6 +664,7 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) if (tsk->in_iowait) { se->iowait_sum += delta; se->iowait_count++; + trace_sched_stat_iowait(tsk, delta); } /* From f93e65c186ab3c05ce2068733ca10e34fd00125e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 1 Sep 2009 10:34:32 +0200 Subject: [PATCH 44/64] sched: Restore __cpu_power to a straight sum of power cpu_power is supposed to be a representation of the process capacity of the cpu, not a value to randomly tweak in order to affect placement. Remove the placement hacks. Signed-off-by: Peter Zijlstra Tested-by: Andreas Herrmann Acked-by: Andreas Herrmann Acked-by: Gautham R Shenoy Cc: Balbir Singh LKML-Reference: <20090901083825.810860576@chello.nl> Signed-off-by: Ingo Molnar --- kernel/sched.c | 28 ++++++++++++---------------- 1 file changed, 12 insertions(+), 16 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index da1edc8277d..584a122b553 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -8464,15 +8464,13 @@ static void free_sched_groups(const struct cpumask *cpu_map, * there are asymmetries in the topology. If there are asymmetries, group * having more cpu_power will pickup more load compared to the group having * less cpu_power. - * - * cpu_power will be a multiple of SCHED_LOAD_SCALE. This multiple represents - * the maximum number of tasks a group can handle in the presence of other idle - * or lightly loaded groups in the same sched domain. */ static void init_sched_groups_power(int cpu, struct sched_domain *sd) { struct sched_domain *child; struct sched_group *group; + long power; + int weight; WARN_ON(!sd || !sd->groups); @@ -8483,22 +8481,20 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd) sd->groups->__cpu_power = 0; - /* - * For perf policy, if the groups in child domain share resources - * (for example cores sharing some portions of the cache hierarchy - * or SMT), then set this domain groups cpu_power such that each group - * can handle only one task, when there are other idle groups in the - * same sched domain. - */ - if (!child || (!(sd->flags & SD_POWERSAVINGS_BALANCE) && - (child->flags & - (SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES)))) { - sg_inc_cpu_power(sd->groups, SCHED_LOAD_SCALE); + if (!child) { + power = SCHED_LOAD_SCALE; + weight = cpumask_weight(sched_domain_span(sd)); + /* + * SMT siblings share the power of a single core. + */ + if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) + power /= weight; + sg_inc_cpu_power(sd->groups, power); return; } /* - * add cpu_power of each child group to this groups cpu_power + * Add cpu_power of each child group to this groups cpu_power. */ group = child->groups; do { From b5d978e0c7e79a7ff842e895c85a86b38c71f1cd Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 1 Sep 2009 10:34:33 +0200 Subject: [PATCH 45/64] sched: Add SD_PREFER_SIBLING Do the placement thing using SD flags. Signed-off-by: Peter Zijlstra Tested-by: Andreas Herrmann Acked-by: Andreas Herrmann Acked-by: Gautham R Shenoy Cc: Balbir Singh LKML-Reference: <20090901083825.897028974@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/sched.h | 29 +++++++++++++++-------------- kernel/sched.c | 14 +++++++++++++- 2 files changed, 28 insertions(+), 15 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 9c96ef2f7e6..651dded2572 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -798,18 +798,19 @@ enum cpu_idle_type { #define SCHED_LOAD_SCALE_FUZZ SCHED_LOAD_SCALE #ifdef CONFIG_SMP -#define SD_LOAD_BALANCE 1 /* Do load balancing on this domain. */ -#define SD_BALANCE_NEWIDLE 2 /* Balance when about to become idle */ -#define SD_BALANCE_EXEC 4 /* Balance on exec */ -#define SD_BALANCE_FORK 8 /* Balance on fork, clone */ -#define SD_WAKE_IDLE 16 /* Wake to idle CPU on task wakeup */ -#define SD_WAKE_AFFINE 32 /* Wake task to waking CPU */ -#define SD_WAKE_BALANCE 64 /* Perform balancing at task wakeup */ -#define SD_SHARE_CPUPOWER 128 /* Domain members share cpu power */ -#define SD_POWERSAVINGS_BALANCE 256 /* Balance for power savings */ -#define SD_SHARE_PKG_RESOURCES 512 /* Domain members share cpu pkg resources */ -#define SD_SERIALIZE 1024 /* Only a single load balancing instance */ -#define SD_WAKE_IDLE_FAR 2048 /* Gain latency sacrificing cache hit */ +#define SD_LOAD_BALANCE 0x0001 /* Do load balancing on this domain. */ +#define SD_BALANCE_NEWIDLE 0x0002 /* Balance when about to become idle */ +#define SD_BALANCE_EXEC 0x0004 /* Balance on exec */ +#define SD_BALANCE_FORK 0x0008 /* Balance on fork, clone */ +#define SD_WAKE_IDLE 0x0010 /* Wake to idle CPU on task wakeup */ +#define SD_WAKE_AFFINE 0x0020 /* Wake task to waking CPU */ +#define SD_WAKE_BALANCE 0x0040 /* Perform balancing at task wakeup */ +#define SD_SHARE_CPUPOWER 0x0080 /* Domain members share cpu power */ +#define SD_POWERSAVINGS_BALANCE 0x0100 /* Balance for power savings */ +#define SD_SHARE_PKG_RESOURCES 0x0200 /* Domain members share cpu pkg resources */ +#define SD_SERIALIZE 0x0400 /* Only a single load balancing instance */ +#define SD_WAKE_IDLE_FAR 0x0800 /* Gain latency sacrificing cache hit */ +#define SD_PREFER_SIBLING 0x1000 /* Prefer to place tasks in a sibling domain */ enum powersavings_balance_level { POWERSAVINGS_BALANCE_NONE = 0, /* No power saving load balance */ @@ -829,7 +830,7 @@ static inline int sd_balance_for_mc_power(void) if (sched_smt_power_savings) return SD_POWERSAVINGS_BALANCE; - return 0; + return SD_PREFER_SIBLING; } static inline int sd_balance_for_package_power(void) @@ -837,7 +838,7 @@ static inline int sd_balance_for_package_power(void) if (sched_mc_power_savings | sched_smt_power_savings) return SD_POWERSAVINGS_BALANCE; - return 0; + return SD_PREFER_SIBLING; } /* diff --git a/kernel/sched.c b/kernel/sched.c index 584a122b553..9d64cec9ae1 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3811,9 +3811,13 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, const struct cpumask *cpus, int *balance, struct sd_lb_stats *sds) { + struct sched_domain *child = sd->child; struct sched_group *group = sd->groups; struct sg_lb_stats sgs; - int load_idx; + int load_idx, prefer_sibling = 0; + + if (child && child->flags & SD_PREFER_SIBLING) + prefer_sibling = 1; init_sd_power_savings_stats(sd, sds, idle); load_idx = get_sd_load_idx(sd, idle); @@ -3833,6 +3837,14 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, sds->total_load += sgs.group_load; sds->total_pwr += group->__cpu_power; + /* + * In case the child domain prefers tasks go to siblings + * first, lower the group capacity to one so that we'll try + * and move all the excess tasks away. + */ + if (prefer_sibling) + sgs.group_capacity = 1; + if (local_group) { sds->this_load = sgs.avg_load; sds->this = group; From cc9fba7d7672fa3ed58d9d9ecb6c45b1351c29a6 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 1 Sep 2009 10:34:34 +0200 Subject: [PATCH 46/64] sched: Update the cpu_power sum during load-balance In order to prepare for a more dynamic cpu_power, update the group sum while walking the sched domains during load-balance. Signed-off-by: Peter Zijlstra Tested-by: Andreas Herrmann Acked-by: Andreas Herrmann Acked-by: Gautham R Shenoy Cc: Balbir Singh LKML-Reference: <20090901083825.985050292@chello.nl> Signed-off-by: Ingo Molnar --- kernel/sched.c | 33 +++++++++++++++++++++++++++++---- 1 file changed, 29 insertions(+), 4 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index 9d64cec9ae1..ecb4a47d421 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3699,6 +3699,28 @@ static inline int check_power_save_busiest_group(struct sd_lb_stats *sds, } #endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ +static void update_sched_power(struct sched_domain *sd) +{ + struct sched_domain *child = sd->child; + struct sched_group *group, *sdg = sd->groups; + unsigned long power = sdg->__cpu_power; + + if (!child) { + /* compute cpu power for this cpu */ + return; + } + + sdg->__cpu_power = 0; + + group = child->groups; + do { + sdg->__cpu_power += group->__cpu_power; + group = group->next; + } while (group != child->groups); + + if (power != sdg->__cpu_power) + sdg->reciprocal_cpu_power = reciprocal_value(sdg->__cpu_power); +} /** * update_sg_lb_stats - Update sched_group's statistics for load balancing. @@ -3712,7 +3734,8 @@ static inline int check_power_save_busiest_group(struct sd_lb_stats *sds, * @balance: Should we balance. * @sgs: variable to hold the statistics for this group. */ -static inline void update_sg_lb_stats(struct sched_group *group, int this_cpu, +static inline void update_sg_lb_stats(struct sched_domain *sd, + struct sched_group *group, int this_cpu, enum cpu_idle_type idle, int load_idx, int *sd_idle, int local_group, const struct cpumask *cpus, int *balance, struct sg_lb_stats *sgs) @@ -3723,8 +3746,11 @@ static inline void update_sg_lb_stats(struct sched_group *group, int this_cpu, unsigned long sum_avg_load_per_task; unsigned long avg_load_per_task; - if (local_group) + if (local_group) { balance_cpu = group_first_cpu(group); + if (balance_cpu == this_cpu) + update_sched_power(sd); + } /* Tally up the load of all CPUs in the group */ sum_avg_load_per_task = avg_load_per_task = 0; @@ -3828,7 +3854,7 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, local_group = cpumask_test_cpu(this_cpu, sched_group_cpus(group)); memset(&sgs, 0, sizeof(sgs)); - update_sg_lb_stats(group, this_cpu, idle, load_idx, sd_idle, + update_sg_lb_stats(sd, group, this_cpu, idle, load_idx, sd_idle, local_group, cpus, balance, &sgs); if (local_group && balance && !(*balance)) @@ -3863,7 +3889,6 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, update_sd_power_savings_stats(group, sds, local_group, &sgs); group = group->next; } while (group != sd->groups); - } /** From a52bfd73589eaf88d9c95ad2c1de0b38a6b27972 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 1 Sep 2009 10:34:35 +0200 Subject: [PATCH 47/64] sched: Add smt_gain The idea is that multi-threading a core yields more work capacity than a single thread, provide a way to express a static gain for threads. Signed-off-by: Peter Zijlstra Tested-by: Andreas Herrmann Acked-by: Andreas Herrmann Acked-by: Gautham R Shenoy Cc: Balbir Singh LKML-Reference: <20090901083826.073345955@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/sched.h | 1 + include/linux/topology.h | 1 + kernel/sched.c | 8 +++++++- 3 files changed, 9 insertions(+), 1 deletion(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 651dded2572..9c81c921acb 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -921,6 +921,7 @@ struct sched_domain { unsigned int newidle_idx; unsigned int wake_idx; unsigned int forkexec_idx; + unsigned int smt_gain; int flags; /* See SD_* */ enum sched_domain_level level; diff --git a/include/linux/topology.h b/include/linux/topology.h index 7402c1a27c4..6203ae5067c 100644 --- a/include/linux/topology.h +++ b/include/linux/topology.h @@ -99,6 +99,7 @@ int arch_update_cpu_topology(void); | SD_SHARE_CPUPOWER, \ .last_balance = jiffies, \ .balance_interval = 1, \ + .smt_gain = 1178, /* 15% */ \ } #endif #endif /* CONFIG_SCHED_SMT */ diff --git a/kernel/sched.c b/kernel/sched.c index ecb4a47d421..55112261027 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -8523,9 +8523,15 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd) weight = cpumask_weight(sched_domain_span(sd)); /* * SMT siblings share the power of a single core. + * Usually multiple threads get a better yield out of + * that one core than a single thread would have, + * reflect that in sd->smt_gain. */ - if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) + if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) { + power *= sd->smt_gain; power /= weight; + power >>= SCHED_LOAD_SHIFT; + } sg_inc_cpu_power(sd->groups, power); return; } From ab29230e673c646292c90c8b9d378b9562145af0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 1 Sep 2009 10:34:36 +0200 Subject: [PATCH 48/64] sched: Implement dynamic cpu_power Recompute the cpu_power for each cpu during load-balance. Signed-off-by: Peter Zijlstra Tested-by: Andreas Herrmann Acked-by: Andreas Herrmann Acked-by: Gautham R Shenoy Cc: Balbir Singh LKML-Reference: <20090901083826.162033479@chello.nl> Signed-off-by: Ingo Molnar --- kernel/sched.c | 38 +++++++++++++++++++++++++++++++++++--- 1 file changed, 35 insertions(+), 3 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index 55112261027..036600fd70b 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3699,14 +3699,46 @@ static inline int check_power_save_busiest_group(struct sd_lb_stats *sds, } #endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ -static void update_sched_power(struct sched_domain *sd) +unsigned long __weak arch_smt_gain(struct sched_domain *sd, int cpu) +{ + unsigned long weight = cpumask_weight(sched_domain_span(sd)); + unsigned long smt_gain = sd->smt_gain; + + smt_gain /= weight; + + return smt_gain; +} + +static void update_cpu_power(struct sched_domain *sd, int cpu) +{ + unsigned long weight = cpumask_weight(sched_domain_span(sd)); + unsigned long power = SCHED_LOAD_SCALE; + struct sched_group *sdg = sd->groups; + unsigned long old = sdg->__cpu_power; + + /* here we could scale based on cpufreq */ + + if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) { + power *= arch_smt_gain(sd, cpu); + power >>= SCHED_LOAD_SHIFT; + } + + /* here we could scale based on RT time */ + + if (power != old) { + sdg->__cpu_power = power; + sdg->reciprocal_cpu_power = reciprocal_value(power); + } +} + +static void update_group_power(struct sched_domain *sd, int cpu) { struct sched_domain *child = sd->child; struct sched_group *group, *sdg = sd->groups; unsigned long power = sdg->__cpu_power; if (!child) { - /* compute cpu power for this cpu */ + update_cpu_power(sd, cpu); return; } @@ -3749,7 +3781,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, if (local_group) { balance_cpu = group_first_cpu(group); if (balance_cpu == this_cpu) - update_sched_power(sd); + update_group_power(sd, this_cpu); } /* Tally up the load of all CPUs in the group */ From e9e9250bc78e7f6342517214c0178a529807964b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 1 Sep 2009 10:34:37 +0200 Subject: [PATCH 49/64] sched: Scale down cpu_power due to RT tasks Keep an average on the amount of time spend on RT tasks and use that fraction to scale down the cpu_power for regular tasks. Signed-off-by: Peter Zijlstra Tested-by: Andreas Herrmann Acked-by: Andreas Herrmann Acked-by: Gautham R Shenoy Cc: Balbir Singh LKML-Reference: <20090901083826.287778431@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/sched.h | 1 + kernel/sched.c | 64 +++++++++++++++++++++++++++++++++++++++++-- kernel/sched_rt.c | 6 ++-- kernel/sysctl.c | 8 ++++++ 4 files changed, 72 insertions(+), 7 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 9c81c921acb..c67ddf309c8 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1831,6 +1831,7 @@ extern unsigned int sysctl_sched_child_runs_first; extern unsigned int sysctl_sched_features; extern unsigned int sysctl_sched_migration_cost; extern unsigned int sysctl_sched_nr_migrate; +extern unsigned int sysctl_sched_time_avg; extern unsigned int sysctl_timer_migration; int sched_nr_latency_handler(struct ctl_table *table, int write, diff --git a/kernel/sched.c b/kernel/sched.c index 036600fd70b..ab532b5de40 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -627,6 +627,9 @@ struct rq { struct task_struct *migration_thread; struct list_head migration_queue; + + u64 rt_avg; + u64 age_stamp; #endif /* calc_load related fields */ @@ -862,6 +865,14 @@ unsigned int sysctl_sched_shares_ratelimit = 250000; */ unsigned int sysctl_sched_shares_thresh = 4; +/* + * period over which we average the RT time consumption, measured + * in ms. + * + * default: 1s + */ +const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC; + /* * period over which we measure -rt task cpu usage in us. * default: 1s @@ -1280,12 +1291,37 @@ void wake_up_idle_cpu(int cpu) } #endif /* CONFIG_NO_HZ */ +static u64 sched_avg_period(void) +{ + return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2; +} + +static void sched_avg_update(struct rq *rq) +{ + s64 period = sched_avg_period(); + + while ((s64)(rq->clock - rq->age_stamp) > period) { + rq->age_stamp += period; + rq->rt_avg /= 2; + } +} + +static void sched_rt_avg_update(struct rq *rq, u64 rt_delta) +{ + rq->rt_avg += rt_delta; + sched_avg_update(rq); +} + #else /* !CONFIG_SMP */ static void resched_task(struct task_struct *p) { assert_spin_locked(&task_rq(p)->lock); set_tsk_need_resched(p); } + +static void sched_rt_avg_update(struct rq *rq, u64 rt_delta) +{ +} #endif /* CONFIG_SMP */ #if BITS_PER_LONG == 32 @@ -3699,7 +3735,7 @@ static inline int check_power_save_busiest_group(struct sd_lb_stats *sds, } #endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ -unsigned long __weak arch_smt_gain(struct sched_domain *sd, int cpu) +unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu) { unsigned long weight = cpumask_weight(sched_domain_span(sd)); unsigned long smt_gain = sd->smt_gain; @@ -3709,6 +3745,24 @@ unsigned long __weak arch_smt_gain(struct sched_domain *sd, int cpu) return smt_gain; } +unsigned long scale_rt_power(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + u64 total, available; + + sched_avg_update(rq); + + total = sched_avg_period() + (rq->clock - rq->age_stamp); + available = total - rq->rt_avg; + + if (unlikely((s64)total < SCHED_LOAD_SCALE)) + total = SCHED_LOAD_SCALE; + + total >>= SCHED_LOAD_SHIFT; + + return div_u64(available, total); +} + static void update_cpu_power(struct sched_domain *sd, int cpu) { unsigned long weight = cpumask_weight(sched_domain_span(sd)); @@ -3719,11 +3773,15 @@ static void update_cpu_power(struct sched_domain *sd, int cpu) /* here we could scale based on cpufreq */ if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) { - power *= arch_smt_gain(sd, cpu); + power *= arch_scale_smt_power(sd, cpu); power >>= SCHED_LOAD_SHIFT; } - /* here we could scale based on RT time */ + power *= scale_rt_power(cpu); + power >>= SCHED_LOAD_SHIFT; + + if (!power) + power = 1; if (power != old) { sdg->__cpu_power = power; diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 3d4020a9ba1..2eb4bd6a526 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -615,6 +615,8 @@ static void update_curr_rt(struct rq *rq) curr->se.exec_start = rq->clock; cpuacct_charge(curr, delta_exec); + sched_rt_avg_update(rq, delta_exec); + if (!rt_bandwidth_enabled()) return; @@ -887,8 +889,6 @@ static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup) if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1) enqueue_pushable_task(rq, p); - - inc_cpu_load(rq, p->se.load.weight); } static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep) @@ -899,8 +899,6 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep) dequeue_rt_entity(rt_se); dequeue_pushable_task(rq, p); - - dec_cpu_load(rq, p->se.load.weight); } /* diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 58be76017fd..6c9836ef4b4 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -330,6 +330,14 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = &proc_dointvec, }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "sched_time_avg", + .data = &sysctl_sched_time_avg, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, { .ctl_name = CTL_UNNUMBERED, .procname = "timer_migration", From bdb94aa5dbd8b55e75f5a50b61312fe589e2c2d1 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 1 Sep 2009 10:34:38 +0200 Subject: [PATCH 50/64] sched: Try to deal with low capacity When the capacity drops low, we want to migrate load away. Allow the load-balancer to remove all tasks when we hit rock bottom. Signed-off-by: Peter Zijlstra Tested-by: Andreas Herrmann Acked-by: Andreas Herrmann Acked-by: Gautham R Shenoy Cc: Balbir Singh LKML-Reference: <20090901083826.342231003@chello.nl> Signed-off-by: Ingo Molnar --- kernel/sched.c | 33 ++++++++++++++++++++++++++++----- 1 file changed, 28 insertions(+), 5 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index ab532b5de40..5f5b359b01b 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3908,8 +3908,8 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task) sgs->group_imb = 1; - sgs->group_capacity = group->__cpu_power / SCHED_LOAD_SCALE; - + sgs->group_capacity = + DIV_ROUND_CLOSEST(group->__cpu_power, SCHED_LOAD_SCALE); } /** @@ -3959,7 +3959,7 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, * and move all the excess tasks away. */ if (prefer_sibling) - sgs.group_capacity = 1; + sgs.group_capacity = min(sgs.group_capacity, 1UL); if (local_group) { sds->this_load = sgs.avg_load; @@ -4191,6 +4191,26 @@ ret: return NULL; } +static struct sched_group *group_of(int cpu) +{ + struct sched_domain *sd = rcu_dereference(cpu_rq(cpu)->sd); + + if (!sd) + return NULL; + + return sd->groups; +} + +static unsigned long power_of(int cpu) +{ + struct sched_group *group = group_of(cpu); + + if (!group) + return SCHED_LOAD_SCALE; + + return group->__cpu_power; +} + /* * find_busiest_queue - find the busiest runqueue among the cpus in group. */ @@ -4203,15 +4223,18 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle, int i; for_each_cpu(i, sched_group_cpus(group)) { + unsigned long power = power_of(i); + unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE); unsigned long wl; if (!cpumask_test_cpu(i, cpus)) continue; rq = cpu_rq(i); - wl = weighted_cpuload(i); + wl = weighted_cpuload(i) * SCHED_LOAD_SCALE; + wl /= power; - if (rq->nr_running == 1 && wl > imbalance) + if (capacity && rq->nr_running == 1 && wl > imbalance) continue; if (wl > max_load) { From d899a789c28ded9c72b57ddb61868d6b8fc23e80 Mon Sep 17 00:00:00 2001 From: Gautham R Shenoy Date: Wed, 2 Sep 2009 16:59:10 +0530 Subject: [PATCH 51/64] sched: Try to deal with low capacity, fix update_sd_power_savings_stats() sgs.group_capacity can now be 0, if for some reason group->__cpu_power happens to be less than SCHED_LOAD_SCALE/2. In that case, we need the following fix to make it work for update_sd_power_savings_stats(). That's because both sum_nr_running and group_capacity are unsigned longs. Cc: Gautham R Shenoy Cc: Peter Zijlstra Cc: Andreas Herrmann Cc: Balbir Singh Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched.c b/kernel/sched.c index 5f5b359b01b..e1ebf9b00f5 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3668,7 +3668,7 @@ static inline void update_sd_power_savings_stats(struct sched_group *group, * capacity but still has some space to pick up some load * from other group and save more power */ - if (sgs->sum_nr_running > sgs->group_capacity - 1) + if (sgs->sum_nr_running + 1 > sgs->group_capacity) return; if (sgs->sum_nr_running > sds->leader_nr_running || From 18a3885fc1ffa92c2212ff0afdf033403d5b0fa0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 1 Sep 2009 10:34:39 +0200 Subject: [PATCH 52/64] sched: Remove reciprocal for cpu_power Its a source of fail, also, now that cpu_power is dynamical, its a waste of time. before: -0 [000] 132.877936: find_busiest_group: avg_load: 0 group_load: 8241 power: 1 after: bash-1689 [001] 137.862151: find_busiest_group: avg_load: 10636288 group_load: 10387 power: 1 [ v2: build fix from From: Andreas Herrmann ] Signed-off-by: Peter Zijlstra Tested-by: Andreas Herrmann Acked-by: Andreas Herrmann Acked-by: Gautham R Shenoy Cc: Balbir Singh LKML-Reference: <20090901083826.425896304@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/sched.h | 10 +---- kernel/sched.c | 101 ++++++++++++++---------------------------- 2 files changed, 36 insertions(+), 75 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index c67ddf309c8..3b7f43e3b73 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -860,15 +860,9 @@ struct sched_group { /* * CPU power of this group, SCHED_LOAD_SCALE being max power for a - * single CPU. This is read only (except for setup, hotplug CPU). - * Note : Never change cpu_power without recompute its reciprocal + * single CPU. */ - unsigned int __cpu_power; - /* - * reciprocal value of cpu_power to avoid expensive divides - * (see include/linux/reciprocal_div.h) - */ - u32 reciprocal_cpu_power; + unsigned int cpu_power; /* * The CPUs this group covers. diff --git a/kernel/sched.c b/kernel/sched.c index e1ebf9b00f5..b5378534685 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -64,7 +64,6 @@ #include #include #include -#include #include #include #include @@ -120,30 +119,8 @@ */ #define RUNTIME_INF ((u64)~0ULL) -#ifdef CONFIG_SMP - static void double_rq_lock(struct rq *rq1, struct rq *rq2); -/* - * Divide a load by a sched group cpu_power : (load / sg->__cpu_power) - * Since cpu_power is a 'constant', we can use a reciprocal divide. - */ -static inline u32 sg_div_cpu_power(const struct sched_group *sg, u32 load) -{ - return reciprocal_divide(load, sg->reciprocal_cpu_power); -} - -/* - * Each time a sched group cpu_power is changed, - * we must compute its reciprocal value - */ -static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val) -{ - sg->__cpu_power += val; - sg->reciprocal_cpu_power = reciprocal_value(sg->__cpu_power); -} -#endif - static inline int rt_policy(int policy) { if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR)) @@ -2335,8 +2312,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) } /* Adjust by relative CPU power of the group */ - avg_load = sg_div_cpu_power(group, - avg_load * SCHED_LOAD_SCALE); + avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power; if (local_group) { this_load = avg_load; @@ -3768,7 +3744,6 @@ static void update_cpu_power(struct sched_domain *sd, int cpu) unsigned long weight = cpumask_weight(sched_domain_span(sd)); unsigned long power = SCHED_LOAD_SCALE; struct sched_group *sdg = sd->groups; - unsigned long old = sdg->__cpu_power; /* here we could scale based on cpufreq */ @@ -3783,33 +3758,26 @@ static void update_cpu_power(struct sched_domain *sd, int cpu) if (!power) power = 1; - if (power != old) { - sdg->__cpu_power = power; - sdg->reciprocal_cpu_power = reciprocal_value(power); - } + sdg->cpu_power = power; } static void update_group_power(struct sched_domain *sd, int cpu) { struct sched_domain *child = sd->child; struct sched_group *group, *sdg = sd->groups; - unsigned long power = sdg->__cpu_power; if (!child) { update_cpu_power(sd, cpu); return; } - sdg->__cpu_power = 0; + sdg->cpu_power = 0; group = child->groups; do { - sdg->__cpu_power += group->__cpu_power; + sdg->cpu_power += group->cpu_power; group = group->next; } while (group != child->groups); - - if (power != sdg->__cpu_power) - sdg->reciprocal_cpu_power = reciprocal_value(sdg->__cpu_power); } /** @@ -3889,8 +3857,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, } /* Adjust by relative CPU power of the group */ - sgs->avg_load = sg_div_cpu_power(group, - sgs->group_load * SCHED_LOAD_SCALE); + sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power; /* @@ -3902,14 +3869,14 @@ static inline void update_sg_lb_stats(struct sched_domain *sd, * normalized nr_running number somewhere that negates * the hierarchy? */ - avg_load_per_task = sg_div_cpu_power(group, - sum_avg_load_per_task * SCHED_LOAD_SCALE); + avg_load_per_task = (sum_avg_load_per_task * SCHED_LOAD_SCALE) / + group->cpu_power; if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task) sgs->group_imb = 1; sgs->group_capacity = - DIV_ROUND_CLOSEST(group->__cpu_power, SCHED_LOAD_SCALE); + DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE); } /** @@ -3951,7 +3918,7 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, return; sds->total_load += sgs.group_load; - sds->total_pwr += group->__cpu_power; + sds->total_pwr += group->cpu_power; /* * In case the child domain prefers tasks go to siblings @@ -4016,28 +3983,28 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds, * moving them. */ - pwr_now += sds->busiest->__cpu_power * + pwr_now += sds->busiest->cpu_power * min(sds->busiest_load_per_task, sds->max_load); - pwr_now += sds->this->__cpu_power * + pwr_now += sds->this->cpu_power * min(sds->this_load_per_task, sds->this_load); pwr_now /= SCHED_LOAD_SCALE; /* Amount of load we'd subtract */ - tmp = sg_div_cpu_power(sds->busiest, - sds->busiest_load_per_task * SCHED_LOAD_SCALE); + tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) / + sds->busiest->cpu_power; if (sds->max_load > tmp) - pwr_move += sds->busiest->__cpu_power * + pwr_move += sds->busiest->cpu_power * min(sds->busiest_load_per_task, sds->max_load - tmp); /* Amount of load we'd add */ - if (sds->max_load * sds->busiest->__cpu_power < + if (sds->max_load * sds->busiest->cpu_power < sds->busiest_load_per_task * SCHED_LOAD_SCALE) - tmp = sg_div_cpu_power(sds->this, - sds->max_load * sds->busiest->__cpu_power); + tmp = (sds->max_load * sds->busiest->cpu_power) / + sds->this->cpu_power; else - tmp = sg_div_cpu_power(sds->this, - sds->busiest_load_per_task * SCHED_LOAD_SCALE); - pwr_move += sds->this->__cpu_power * + tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) / + sds->this->cpu_power; + pwr_move += sds->this->cpu_power * min(sds->this_load_per_task, sds->this_load + tmp); pwr_move /= SCHED_LOAD_SCALE; @@ -4072,8 +4039,8 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, sds->max_load - sds->busiest_load_per_task); /* How much load to actually move to equalise the imbalance */ - *imbalance = min(max_pull * sds->busiest->__cpu_power, - (sds->avg_load - sds->this_load) * sds->this->__cpu_power) + *imbalance = min(max_pull * sds->busiest->cpu_power, + (sds->avg_load - sds->this_load) * sds->this->cpu_power) / SCHED_LOAD_SCALE; /* @@ -4208,7 +4175,7 @@ static unsigned long power_of(int cpu) if (!group) return SCHED_LOAD_SCALE; - return group->__cpu_power; + return group->cpu_power; } /* @@ -7922,7 +7889,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, break; } - if (!group->__cpu_power) { + if (!group->cpu_power) { printk(KERN_CONT "\n"); printk(KERN_ERR "ERROR: domain->cpu_power not " "set\n"); @@ -7946,9 +7913,9 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group)); printk(KERN_CONT " %s", str); - if (group->__cpu_power != SCHED_LOAD_SCALE) { - printk(KERN_CONT " (__cpu_power = %d)", - group->__cpu_power); + if (group->cpu_power != SCHED_LOAD_SCALE) { + printk(KERN_CONT " (cpu_power = %d)", + group->cpu_power); } group = group->next; @@ -8233,7 +8200,7 @@ init_sched_build_groups(const struct cpumask *span, continue; cpumask_clear(sched_group_cpus(sg)); - sg->__cpu_power = 0; + sg->cpu_power = 0; for_each_cpu(j, span) { if (group_fn(j, cpu_map, NULL, tmpmask) != group) @@ -8491,7 +8458,7 @@ static void init_numa_sched_groups_power(struct sched_group *group_head) continue; } - sg_inc_cpu_power(sg, sd->groups->__cpu_power); + sg->cpu_power += sd->groups->cpu_power; } sg = sg->next; } while (sg != group_head); @@ -8528,7 +8495,7 @@ static int build_numa_sched_groups(struct s_data *d, sd->groups = sg; } - sg->__cpu_power = 0; + sg->cpu_power = 0; cpumask_copy(sched_group_cpus(sg), d->nodemask); sg->next = sg; cpumask_or(d->covered, d->covered, d->nodemask); @@ -8551,7 +8518,7 @@ static int build_numa_sched_groups(struct s_data *d, "Can not alloc domain group for node %d\n", j); return -ENOMEM; } - sg->__cpu_power = 0; + sg->cpu_power = 0; cpumask_copy(sched_group_cpus(sg), d->tmpmask); sg->next = prev->next; cpumask_or(d->covered, d->covered, d->tmpmask); @@ -8629,7 +8596,7 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd) child = sd->child; - sd->groups->__cpu_power = 0; + sd->groups->cpu_power = 0; if (!child) { power = SCHED_LOAD_SCALE; @@ -8645,7 +8612,7 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd) power /= weight; power >>= SCHED_LOAD_SHIFT; } - sg_inc_cpu_power(sd->groups, power); + sd->groups->cpu_power += power; return; } @@ -8654,7 +8621,7 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd) */ group = child->groups; do { - sg_inc_cpu_power(sd->groups, group->__cpu_power); + sd->groups->cpu_power += group->cpu_power; group = group->next; } while (group != child->groups); } From d7ea17a76916e456fcc78e45142c66f7fb875e3d Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 4 Sep 2009 11:49:25 +0200 Subject: [PATCH 53/64] sched: Fix dynamic power-balancing crash This crash: [ 1774.088275] divide error: 0000 [#1] SMP [ 1774.100355] CPU 13 [ 1774.102498] Modules linked in: [ 1774.105631] Pid: 30881, comm: hackbench Not tainted 2.6.31-rc8-tip-01308-g484d664-dirty #1629 X8DTN [ 1774.114807] RIP: 0010:[] [] sched_balance_self+0x19b/0x2d4 Triggers because update_group_power() modifies the sd tree and does temporary calculations there - not considering that other CPUs could observe intermediate values, such as the zero initial value. Calculate it in a temporary variable instead. (we need no memory barrier as these are all statistical values anyway) Acked-by: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker LKML-Reference: <20090904092742.GA11014@elte.hu> Signed-off-by: Ingo Molnar --- kernel/sched.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index b5378534685..796baf73197 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3765,19 +3765,22 @@ static void update_group_power(struct sched_domain *sd, int cpu) { struct sched_domain *child = sd->child; struct sched_group *group, *sdg = sd->groups; + unsigned long power; if (!child) { update_cpu_power(sd, cpu); return; } - sdg->cpu_power = 0; + power = 0; group = child->groups; do { - sdg->cpu_power += group->cpu_power; + power += group->cpu_power; group = group->next; } while (group != child->groups); + + sdg->cpu_power = power; } /** From 47734f89be0614b5acbd6a532390f9c72f019648 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 4 Sep 2009 11:21:24 +0200 Subject: [PATCH 54/64] sched: Clean up topology.h Re-organize the flag settings so that it's visible at a glance which sched-domains flags are set and which not. With the new balancer code we'll need to re-tune these details anyway, so make it cleaner to make fewer mistakes down the road ;-) Cc: Peter Zijlstra Cc: Andreas Herrmann Cc: Andreas Herrmann Cc: Gautham R Shenoy Cc: Balbir Singh LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/include/asm/topology.h | 47 +++++---- include/linux/topology.h | 169 +++++++++++++++++++------------- 2 files changed, 129 insertions(+), 87 deletions(-) diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h index 066ef590d7e..be29eb81fb0 100644 --- a/arch/x86/include/asm/topology.h +++ b/arch/x86/include/asm/topology.h @@ -129,25 +129,34 @@ extern unsigned long node_remap_size[]; #endif /* sched_domains SD_NODE_INIT for NUMA machines */ -#define SD_NODE_INIT (struct sched_domain) { \ - .min_interval = 8, \ - .max_interval = 32, \ - .busy_factor = 32, \ - .imbalance_pct = 125, \ - .cache_nice_tries = SD_CACHE_NICE_TRIES, \ - .busy_idx = 3, \ - .idle_idx = SD_IDLE_IDX, \ - .newidle_idx = SD_NEWIDLE_IDX, \ - .wake_idx = 1, \ - .forkexec_idx = SD_FORKEXEC_IDX, \ - .flags = SD_LOAD_BALANCE \ - | SD_BALANCE_EXEC \ - | SD_BALANCE_FORK \ - | SD_WAKE_AFFINE \ - | SD_WAKE_BALANCE \ - | SD_SERIALIZE, \ - .last_balance = jiffies, \ - .balance_interval = 1, \ +#define SD_NODE_INIT (struct sched_domain) { \ + .min_interval = 8, \ + .max_interval = 32, \ + .busy_factor = 32, \ + .imbalance_pct = 125, \ + .cache_nice_tries = SD_CACHE_NICE_TRIES, \ + .busy_idx = 3, \ + .idle_idx = SD_IDLE_IDX, \ + .newidle_idx = SD_NEWIDLE_IDX, \ + .wake_idx = 1, \ + .forkexec_idx = SD_FORKEXEC_IDX, \ + \ + .flags = 1*SD_LOAD_BALANCE \ + | 0*SD_BALANCE_NEWIDLE \ + | 1*SD_BALANCE_EXEC \ + | 1*SD_BALANCE_FORK \ + | 0*SD_WAKE_IDLE \ + | 1*SD_WAKE_AFFINE \ + | 1*SD_WAKE_BALANCE \ + | 0*SD_SHARE_CPUPOWER \ + | 0*SD_POWERSAVINGS_BALANCE \ + | 0*SD_SHARE_PKG_RESOURCES \ + | 1*SD_SERIALIZE \ + | 0*SD_WAKE_IDLE_FAR \ + | 0*SD_PREFER_SIBLING \ + , \ + .last_balance = jiffies, \ + .balance_interval = 1, \ } #ifdef CONFIG_X86_64_ACPI_NUMA diff --git a/include/linux/topology.h b/include/linux/topology.h index 6203ae5067c..fe2c0329f82 100644 --- a/include/linux/topology.h +++ b/include/linux/topology.h @@ -85,21 +85,29 @@ int arch_update_cpu_topology(void); #define ARCH_HAS_SCHED_WAKE_IDLE /* Common values for SMT siblings */ #ifndef SD_SIBLING_INIT -#define SD_SIBLING_INIT (struct sched_domain) { \ - .min_interval = 1, \ - .max_interval = 2, \ - .busy_factor = 64, \ - .imbalance_pct = 110, \ - .flags = SD_LOAD_BALANCE \ - | SD_BALANCE_NEWIDLE \ - | SD_BALANCE_FORK \ - | SD_BALANCE_EXEC \ - | SD_WAKE_AFFINE \ - | SD_WAKE_BALANCE \ - | SD_SHARE_CPUPOWER, \ - .last_balance = jiffies, \ - .balance_interval = 1, \ - .smt_gain = 1178, /* 15% */ \ +#define SD_SIBLING_INIT (struct sched_domain) { \ + .min_interval = 1, \ + .max_interval = 2, \ + .busy_factor = 64, \ + .imbalance_pct = 110, \ + \ + .flags = 1*SD_LOAD_BALANCE \ + | 1*SD_BALANCE_NEWIDLE \ + | 1*SD_BALANCE_EXEC \ + | 1*SD_BALANCE_FORK \ + | 0*SD_WAKE_IDLE \ + | 1*SD_WAKE_AFFINE \ + | 1*SD_WAKE_BALANCE \ + | 1*SD_SHARE_CPUPOWER \ + | 0*SD_POWERSAVINGS_BALANCE \ + | 0*SD_SHARE_PKG_RESOURCES \ + | 0*SD_SERIALIZE \ + | 0*SD_WAKE_IDLE_FAR \ + | 0*SD_PREFER_SIBLING \ + , \ + .last_balance = jiffies, \ + .balance_interval = 1, \ + .smt_gain = 1178, /* 15% */ \ } #endif #endif /* CONFIG_SCHED_SMT */ @@ -107,69 +115,94 @@ int arch_update_cpu_topology(void); #ifdef CONFIG_SCHED_MC /* Common values for MC siblings. for now mostly derived from SD_CPU_INIT */ #ifndef SD_MC_INIT -#define SD_MC_INIT (struct sched_domain) { \ - .min_interval = 1, \ - .max_interval = 4, \ - .busy_factor = 64, \ - .imbalance_pct = 125, \ - .cache_nice_tries = 1, \ - .busy_idx = 2, \ - .wake_idx = 1, \ - .forkexec_idx = 1, \ - .flags = SD_LOAD_BALANCE \ - | SD_BALANCE_FORK \ - | SD_BALANCE_EXEC \ - | SD_WAKE_AFFINE \ - | SD_WAKE_BALANCE \ - | SD_SHARE_PKG_RESOURCES\ - | sd_balance_for_mc_power()\ - | sd_power_saving_flags(),\ - .last_balance = jiffies, \ - .balance_interval = 1, \ +#define SD_MC_INIT (struct sched_domain) { \ + .min_interval = 1, \ + .max_interval = 4, \ + .busy_factor = 64, \ + .imbalance_pct = 125, \ + .cache_nice_tries = 1, \ + .busy_idx = 2, \ + .wake_idx = 1, \ + .forkexec_idx = 1, \ + \ + .flags = 1*SD_LOAD_BALANCE \ + | 0*SD_BALANCE_NEWIDLE \ + | 1*SD_BALANCE_EXEC \ + | 1*SD_BALANCE_FORK \ + | 0*SD_WAKE_IDLE \ + | 1*SD_WAKE_AFFINE \ + | 1*SD_WAKE_BALANCE \ + | 0*SD_SHARE_CPUPOWER \ + | 1*SD_SHARE_PKG_RESOURCES \ + | 0*SD_SERIALIZE \ + | 0*SD_WAKE_IDLE_FAR \ + | sd_balance_for_mc_power() \ + | sd_power_saving_flags() \ + , \ + .last_balance = jiffies, \ + .balance_interval = 1, \ } #endif #endif /* CONFIG_SCHED_MC */ /* Common values for CPUs */ #ifndef SD_CPU_INIT -#define SD_CPU_INIT (struct sched_domain) { \ - .min_interval = 1, \ - .max_interval = 4, \ - .busy_factor = 64, \ - .imbalance_pct = 125, \ - .cache_nice_tries = 1, \ - .busy_idx = 2, \ - .idle_idx = 1, \ - .newidle_idx = 2, \ - .wake_idx = 1, \ - .forkexec_idx = 1, \ - .flags = SD_LOAD_BALANCE \ - | SD_BALANCE_EXEC \ - | SD_BALANCE_FORK \ - | SD_WAKE_AFFINE \ - | SD_WAKE_BALANCE \ - | sd_balance_for_package_power()\ - | sd_power_saving_flags(),\ - .last_balance = jiffies, \ - .balance_interval = 1, \ +#define SD_CPU_INIT (struct sched_domain) { \ + .min_interval = 1, \ + .max_interval = 4, \ + .busy_factor = 64, \ + .imbalance_pct = 125, \ + .cache_nice_tries = 1, \ + .busy_idx = 2, \ + .idle_idx = 1, \ + .newidle_idx = 2, \ + .wake_idx = 1, \ + .forkexec_idx = 1, \ + \ + .flags = 1*SD_LOAD_BALANCE \ + | 0*SD_BALANCE_NEWIDLE \ + | 1*SD_BALANCE_EXEC \ + | 1*SD_BALANCE_FORK \ + | 0*SD_WAKE_IDLE \ + | 0*SD_WAKE_AFFINE \ + | 1*SD_WAKE_BALANCE \ + | 0*SD_SHARE_CPUPOWER \ + | 0*SD_SHARE_PKG_RESOURCES \ + | 0*SD_SERIALIZE \ + | 0*SD_WAKE_IDLE_FAR \ + | sd_balance_for_package_power() \ + | sd_power_saving_flags() \ + , \ + .last_balance = jiffies, \ + .balance_interval = 1, \ } #endif /* sched_domains SD_ALLNODES_INIT for NUMA machines */ -#define SD_ALLNODES_INIT (struct sched_domain) { \ - .min_interval = 64, \ - .max_interval = 64*num_online_cpus(), \ - .busy_factor = 128, \ - .imbalance_pct = 133, \ - .cache_nice_tries = 1, \ - .busy_idx = 3, \ - .idle_idx = 3, \ - .flags = SD_LOAD_BALANCE \ - | SD_BALANCE_NEWIDLE \ - | SD_WAKE_AFFINE \ - | SD_SERIALIZE, \ - .last_balance = jiffies, \ - .balance_interval = 64, \ +#define SD_ALLNODES_INIT (struct sched_domain) { \ + .min_interval = 64, \ + .max_interval = 64*num_online_cpus(), \ + .busy_factor = 128, \ + .imbalance_pct = 133, \ + .cache_nice_tries = 1, \ + .busy_idx = 3, \ + .idle_idx = 3, \ + .flags = 1*SD_LOAD_BALANCE \ + | 1*SD_BALANCE_NEWIDLE \ + | 0*SD_BALANCE_EXEC \ + | 0*SD_BALANCE_FORK \ + | 0*SD_WAKE_IDLE \ + | 1*SD_WAKE_AFFINE \ + | 0*SD_WAKE_BALANCE \ + | 0*SD_SHARE_CPUPOWER \ + | 0*SD_POWERSAVINGS_BALANCE \ + | 0*SD_SHARE_PKG_RESOURCES \ + | 1*SD_SERIALIZE \ + | 0*SD_WAKE_IDLE_FAR \ + | 0*SD_PREFER_SIBLING \ + , \ + .last_balance = jiffies, \ + .balance_interval = 64, \ } #ifdef CONFIG_NUMA From 840a0653100dbde599ae8ddf83fa214dfa5fd1aa Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 4 Sep 2009 11:32:54 +0200 Subject: [PATCH 55/64] sched: Turn on SD_BALANCE_NEWIDLE Start the re-tuning of the balancer by turning on newidle. It improves hackbench performance and parallelism on a 4x4 box. The "perf stat --repeat 10" measurements give us: domain0 domain1 ....................................... -SD_BALANCE_NEWIDLE -SD_BALANCE_NEWIDLE: 2041.273208 task-clock-msecs # 9.354 CPUs ( +- 0.363% ) +SD_BALANCE_NEWIDLE -SD_BALANCE_NEWIDLE: 2086.326925 task-clock-msecs # 11.934 CPUs ( +- 0.301% ) +SD_BALANCE_NEWIDLE +SD_BALANCE_NEWIDLE: 2115.289791 task-clock-msecs # 12.158 CPUs ( +- 0.263% ) Acked-by: Peter Zijlstra Cc: Andreas Herrmann Cc: Andreas Herrmann Cc: Gautham R Shenoy Cc: Balbir Singh LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/include/asm/topology.h | 2 +- include/linux/topology.h | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h index be29eb81fb0..ef7bc7fc252 100644 --- a/arch/x86/include/asm/topology.h +++ b/arch/x86/include/asm/topology.h @@ -142,7 +142,7 @@ extern unsigned long node_remap_size[]; .forkexec_idx = SD_FORKEXEC_IDX, \ \ .flags = 1*SD_LOAD_BALANCE \ - | 0*SD_BALANCE_NEWIDLE \ + | 1*SD_BALANCE_NEWIDLE \ | 1*SD_BALANCE_EXEC \ | 1*SD_BALANCE_FORK \ | 0*SD_WAKE_IDLE \ diff --git a/include/linux/topology.h b/include/linux/topology.h index fe2c0329f82..66774fddec9 100644 --- a/include/linux/topology.h +++ b/include/linux/topology.h @@ -126,7 +126,7 @@ int arch_update_cpu_topology(void); .forkexec_idx = 1, \ \ .flags = 1*SD_LOAD_BALANCE \ - | 0*SD_BALANCE_NEWIDLE \ + | 1*SD_BALANCE_NEWIDLE \ | 1*SD_BALANCE_EXEC \ | 1*SD_BALANCE_FORK \ | 0*SD_WAKE_IDLE \ @@ -160,7 +160,7 @@ int arch_update_cpu_topology(void); .forkexec_idx = 1, \ \ .flags = 1*SD_LOAD_BALANCE \ - | 0*SD_BALANCE_NEWIDLE \ + | 1*SD_BALANCE_NEWIDLE \ | 1*SD_BALANCE_EXEC \ | 1*SD_BALANCE_FORK \ | 0*SD_WAKE_IDLE \ From cdd2ab3de4301728b20efd6225681d3ff591a938 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 7 Sep 2009 18:12:06 +0200 Subject: [PATCH 56/64] sched: Remove short cut from select_task_rq_fair() select_task_rq_fair() incorrectly skips the wake_affine() logic, remove this. When prev_cpu == this_cpu, the code jumps straight to the wake_idle() logic, this doesn't give the wake_affine() logic the chance to pin the task to this cpu. Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 2ff850f90d1..d7fda41ddaf 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1305,8 +1305,6 @@ static int select_task_rq_fair(struct task_struct *p, int sync) this_rq = cpu_rq(this_cpu); new_cpu = prev_cpu; - if (prev_cpu == this_cpu) - goto out; /* * 'this_sd' is the first domain that both * this_cpu and prev_cpu are present in: From 71a29aa7b600595d0ef373ea605ac656876d1f2f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 7 Sep 2009 18:28:05 +0200 Subject: [PATCH 57/64] sched: Deal with low-load in wake_affine() wake_affine() would always fail under low-load situations where both prev and this were idle, because adding a single task will always be a significant imbalance, even if there's nothing around that could balance it. Deal with this by allowing imbalance when there's nothing you can do about it. Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index d7fda41ddaf..cc97ea498f2 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1262,7 +1262,17 @@ wake_affine(struct sched_domain *this_sd, struct rq *this_rq, tg = task_group(p); weight = p->se.load.weight; - balanced = 100*(tl + effective_load(tg, this_cpu, weight, weight)) <= + /* + * In low-load situations, where prev_cpu is idle and this_cpu is idle + * due to the sync cause above having dropped tl to 0, we'll always have + * an imbalance, but there's really nothing you can do about that, so + * that's good too. + * + * Otherwise check if either cpus are near enough in load to allow this + * task to be woken on this_cpu. + */ + balanced = !tl || + 100*(tl + effective_load(tg, this_cpu, weight, weight)) <= imbalance*(load + effective_load(tg, prev_cpu, 0, weight)); /* From a8fae3ec5f118dc92517dcbed3ecf69ddb641d0f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 7 Sep 2009 18:32:32 +0200 Subject: [PATCH 58/64] sched: enable SD_WAKE_IDLE Now that SD_WAKE_IDLE doesn't make pipe-test suck anymore, enable it by default for MC, CPU and NUMA domains. Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/include/asm/topology.h | 2 +- include/linux/topology.h | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h index ef7bc7fc252..26d06e052a1 100644 --- a/arch/x86/include/asm/topology.h +++ b/arch/x86/include/asm/topology.h @@ -152,7 +152,7 @@ extern unsigned long node_remap_size[]; | 0*SD_POWERSAVINGS_BALANCE \ | 0*SD_SHARE_PKG_RESOURCES \ | 1*SD_SERIALIZE \ - | 0*SD_WAKE_IDLE_FAR \ + | 1*SD_WAKE_IDLE_FAR \ | 0*SD_PREFER_SIBLING \ , \ .last_balance = jiffies, \ diff --git a/include/linux/topology.h b/include/linux/topology.h index 66774fddec9..85e8cf7d393 100644 --- a/include/linux/topology.h +++ b/include/linux/topology.h @@ -129,7 +129,7 @@ int arch_update_cpu_topology(void); | 1*SD_BALANCE_NEWIDLE \ | 1*SD_BALANCE_EXEC \ | 1*SD_BALANCE_FORK \ - | 0*SD_WAKE_IDLE \ + | 1*SD_WAKE_IDLE \ | 1*SD_WAKE_AFFINE \ | 1*SD_WAKE_BALANCE \ | 0*SD_SHARE_CPUPOWER \ @@ -163,7 +163,7 @@ int arch_update_cpu_topology(void); | 1*SD_BALANCE_NEWIDLE \ | 1*SD_BALANCE_EXEC \ | 1*SD_BALANCE_FORK \ - | 0*SD_WAKE_IDLE \ + | 1*SD_WAKE_IDLE \ | 0*SD_WAKE_AFFINE \ | 1*SD_WAKE_BALANCE \ | 0*SD_SHARE_CPUPOWER \ @@ -198,7 +198,7 @@ int arch_update_cpu_topology(void); | 0*SD_POWERSAVINGS_BALANCE \ | 0*SD_SHARE_PKG_RESOURCES \ | 1*SD_SERIALIZE \ - | 0*SD_WAKE_IDLE_FAR \ + | 1*SD_WAKE_IDLE_FAR \ | 0*SD_PREFER_SIBLING \ , \ .last_balance = jiffies, \ From b5d9d734a53e0204aab0089079cbde2a1285a38f Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Tue, 8 Sep 2009 11:12:28 +0200 Subject: [PATCH 59/64] sched: Ensure that a child can't gain time over it's parent after fork() A fork/exec load is usually "pass the baton", so the child should never be placed behind the parent. With START_DEBIT we make room for the new task, but with child_runs_first, that room comes out of the _parent's_ hide. There's nothing to say that the parent wasn't ahead of min_vruntime at fork() time, which means that the "baton carrier", who is essentially the parent in drag, can gain time and increase scheduling latencies for waiters. With NEW_FAIR_SLEEPERS + START_DEBIT + child_runs_first enabled, we essentially pass the sleeper fairness off to the child, which is fine, but if we don't base placement on the parent's updated vruntime, we can end up compounding latency woes if the child itself then does fork/exec. The debit incurred at fork doesn't hurt the parent who is then going to sleep and maybe exit, but the child who acquires the error harms all comers. This improves latencies of make -j kernel build workloads. Reported-by: Jens Axboe Signed-off-by: Mike Galbraith Acked-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index cc97ea498f2..e386e5dfcda 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -728,11 +728,11 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) vruntime -= thresh; } - - /* ensure we never gain time by being placed backwards. */ - vruntime = max_vruntime(se->vruntime, vruntime); } + /* ensure we never gain time by being placed backwards. */ + vruntime = max_vruntime(se->vruntime, vruntime); + se->vruntime = vruntime; } @@ -1756,6 +1756,8 @@ static void task_new_fair(struct rq *rq, struct task_struct *p) sched_info_queued(p); update_curr(cfs_rq); + if (curr) + se->vruntime = curr->vruntime; place_entity(cfs_rq, se, 1); /* 'curr' will be NULL if the child belongs to a different group */ From 2bba22c50b06abe9fd0d23933b1e64d35b419262 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Wed, 9 Sep 2009 15:41:37 +0200 Subject: [PATCH 60/64] sched: Turn off child_runs_first Set child_runs_first default to off. It hurts 'optimal' make -j workloads as make jobs get preempted by child tasks, reducing parallelism. Note, this patch might make existing races in user applications more prominent than before - so breakages might be bisected to this commit. Child-runs-first is broken on SMP to begin with, and we already had it off briefly in v2.6.23 so most of the offenders ought to be fixed. Would be nice not to revert this commit but fix those apps finally ... Signed-off-by: Mike Galbraith Acked-by: Peter Zijlstra LKML-Reference: <1252486344.28645.18.camel@marge.simson.net> [ made the sysctl independent of CONFIG_SCHED_DEBUG, in case people want to work around broken apps. ] Signed-off-by: Ingo Molnar --- include/linux/sched.h | 2 +- kernel/sched_fair.c | 4 ++-- kernel/sysctl.c | 16 ++++++++-------- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 3b7f43e3b73..3a50e822249 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1820,8 +1820,8 @@ extern unsigned int sysctl_sched_min_granularity; extern unsigned int sysctl_sched_wakeup_granularity; extern unsigned int sysctl_sched_shares_ratelimit; extern unsigned int sysctl_sched_shares_thresh; -#ifdef CONFIG_SCHED_DEBUG extern unsigned int sysctl_sched_child_runs_first; +#ifdef CONFIG_SCHED_DEBUG extern unsigned int sysctl_sched_features; extern unsigned int sysctl_sched_migration_cost; extern unsigned int sysctl_sched_nr_migrate; diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index e386e5dfcda..af325a382b1 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -48,10 +48,10 @@ unsigned int sysctl_sched_min_granularity = 4000000ULL; static unsigned int sched_nr_latency = 5; /* - * After fork, child runs first. (default) If set to 0 then + * After fork, child runs first. If set to 0 (default) then * parent will (try to) run first. */ -const_debug unsigned int sysctl_sched_child_runs_first = 1; +unsigned int sysctl_sched_child_runs_first __read_mostly; /* * sys_sched_yield() compat mode diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 6c9836ef4b4..25d6bf3383b 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -246,6 +246,14 @@ static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */ #endif static struct ctl_table kern_table[] = { + { + .ctl_name = CTL_UNNUMBERED, + .procname = "sched_child_runs_first", + .data = &sysctl_sched_child_runs_first, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, #ifdef CONFIG_SCHED_DEBUG { .ctl_name = CTL_UNNUMBERED, @@ -298,14 +306,6 @@ static struct ctl_table kern_table[] = { .strategy = &sysctl_intvec, .extra1 = &zero, }, - { - .ctl_name = CTL_UNNUMBERED, - .procname = "sched_child_runs_first", - .data = &sysctl_sched_child_runs_first, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, { .ctl_name = CTL_UNNUMBERED, .procname = "sched_features", From 172e082a9111ea504ee34cbba26284a5ebdc53a7 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Wed, 9 Sep 2009 15:41:37 +0200 Subject: [PATCH 61/64] sched: Re-tune the scheduler latency defaults to decrease worst-case latencies Reduce the latency target from 20 msecs to 5 msecs. Why? Larger latencies increase spread, which is good for scaling, but bad for worst case latency. We still have the ilog(nr_cpus) rule to scale up on bigger server boxes. Signed-off-by: Mike Galbraith Acked-by: Peter Zijlstra LKML-Reference: <1252486344.28645.18.camel@marge.simson.net> Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index af325a382b1..26fadb44250 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -24,7 +24,7 @@ /* * Targeted preemption latency for CPU-bound tasks: - * (default: 20ms * (1 + ilog(ncpus)), units: nanoseconds) + * (default: 5ms * (1 + ilog(ncpus)), units: nanoseconds) * * NOTE: this latency value is not the same as the concept of * 'timeslice length' - timeslices in CFS are of variable length @@ -34,13 +34,13 @@ * (to see the precise effective timeslice length of your workload, * run vmstat and monitor the context-switches (cs) field) */ -unsigned int sysctl_sched_latency = 20000000ULL; +unsigned int sysctl_sched_latency = 5000000ULL; /* * Minimal preemption granularity for CPU-bound tasks: - * (default: 4 msec * (1 + ilog(ncpus)), units: nanoseconds) + * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds) */ -unsigned int sysctl_sched_min_granularity = 4000000ULL; +unsigned int sysctl_sched_min_granularity = 1000000ULL; /* * is kept at sysctl_sched_latency / sysctl_sched_min_granularity @@ -63,13 +63,13 @@ unsigned int __read_mostly sysctl_sched_compat_yield; /* * SCHED_OTHER wake-up granularity. - * (default: 5 msec * (1 + ilog(ncpus)), units: nanoseconds) + * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds) * * This option delays the preemption effects of decoupled workloads * and reduces their over-scheduling. Synchronous workloads will still * have immediate wakeup/sleep latencies. */ -unsigned int sysctl_sched_wakeup_granularity = 5000000UL; +unsigned int sysctl_sched_wakeup_granularity = 1000000UL; const_debug unsigned int sysctl_sched_migration_cost = 500000UL; From 61cbe54d9479ad98283b2dda686deae4c34b2d59 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Wed, 9 Sep 2009 15:41:37 +0200 Subject: [PATCH 62/64] sched: Keep kthreads at default priority Removes kthread/workqueue priority boost, they increase worst-case desktop latencies. Signed-off-by: Mike Galbraith Acked-by: Peter Zijlstra LKML-Reference: <1252486344.28645.18.camel@marge.simson.net> Signed-off-by: Ingo Molnar --- kernel/kthread.c | 4 ---- kernel/workqueue.c | 2 -- 2 files changed, 6 deletions(-) diff --git a/kernel/kthread.c b/kernel/kthread.c index eb8751aa041..5fe709982ca 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -16,8 +16,6 @@ #include #include -#define KTHREAD_NICE_LEVEL (-5) - static DEFINE_SPINLOCK(kthread_create_lock); static LIST_HEAD(kthread_create_list); struct task_struct *kthreadd_task; @@ -145,7 +143,6 @@ struct task_struct *kthread_create(int (*threadfn)(void *data), * The kernel thread should not inherit these properties. */ sched_setscheduler_nocheck(create.result, SCHED_NORMAL, ¶m); - set_user_nice(create.result, KTHREAD_NICE_LEVEL); set_cpus_allowed_ptr(create.result, cpu_all_mask); } return create.result; @@ -221,7 +218,6 @@ int kthreadd(void *unused) /* Setup a clean context for our children to inherit. */ set_task_comm(tsk, "kthreadd"); ignore_signals(tsk); - set_user_nice(tsk, KTHREAD_NICE_LEVEL); set_cpus_allowed_ptr(tsk, cpu_all_mask); set_mems_allowed(node_possible_map); diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 0668795d881..ea1b4e7674d 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -317,8 +317,6 @@ static int worker_thread(void *__cwq) if (cwq->wq->freezeable) set_freezable(); - set_user_nice(current, -5); - for (;;) { prepare_to_wait(&cwq->more_work, &wait, TASK_INTERRUPTIBLE); if (!freezing(current) && From 3f2aa307c4d26b4ed6509d0a79e8254c9e07e921 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 10 Sep 2009 20:34:48 +0200 Subject: [PATCH 63/64] sched: Disable NEW_FAIR_SLEEPERS for now Nikos Chantziaras and Jens Axboe reported that turning off NEW_FAIR_SLEEPERS improves desktop interactivity visibly. Nikos described his experiences the following way: " With this setting, I can do "nice -n 19 make -j20" and still have a very smooth desktop and watch a movie at the same time. Various other annoyances (like the "logout/shutdown/restart" dialog of KDE not appearing at all until the background fade-out effect has finished) are also gone. So this seems to be the single most important setting that vastly improves desktop behavior, at least here. " Jens described it the following way, referring to a 10-seconds xmodmap scheduling delay he was trying to debug: " Then I tried switching NO_NEW_FAIR_SLEEPERS on, and then I get: Performance counter stats for 'xmodmap .xmodmap-carl': 9.009137 task-clock-msecs # 0.447 CPUs 18 context-switches # 0.002 M/sec 1 CPU-migrations # 0.000 M/sec 315 page-faults # 0.035 M/sec 0.020167093 seconds time elapsed Woot! " So disable it for now. In perf trace output i can see weird delta timestamps: cc1-9943 [001] 2802.059479616: sched_stat_wait: task: as:9944 wait: 2801938766276 [ns] That nsec field is not supposed to be that large. More digging is needed - but lets turn it off while the real bug is found. Reported-by: Nikos Chantziaras Tested-by: Nikos Chantziaras Reported-by: Jens Axboe Tested-by: Jens Axboe Acked-by: Peter Zijlstra Cc: Mike Galbraith LKML-Reference: <4AA93D34.8040500@arcor.de> Signed-off-by: Ingo Molnar --- kernel/sched_features.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched_features.h b/kernel/sched_features.h index 4569bfa7df9..e2dc63a5815 100644 --- a/kernel/sched_features.h +++ b/kernel/sched_features.h @@ -1,4 +1,4 @@ -SCHED_FEAT(NEW_FAIR_SLEEPERS, 1) +SCHED_FEAT(NEW_FAIR_SLEEPERS, 0) SCHED_FEAT(NORMALIZED_SLEEPER, 0) SCHED_FEAT(ADAPTIVE_GRAN, 1) SCHED_FEAT(WAKEUP_PREEMPT, 1) From e1f8450854d69f0291882804406ea1bab3ca44b4 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 10 Sep 2009 20:52:09 +0200 Subject: [PATCH 64/64] sched: Fix sched::sched_stat_wait tracepoint field This weird perf trace output: cc1-9943 [001] 2802.059479616: sched_stat_wait: task: as:9944 wait: 2801938766276 [ns] Is caused by setting one component field of the delta to zero a bit too early. Move it to later. ( Note, this does not affect the NEW_FAIR_SLEEPERS interactivity bug, it's just a reporting bug in essence. ) Acked-by: Peter Zijlstra Cc: Nikos Chantziaras Cc: Jens Axboe Cc: Mike Galbraith LKML-Reference: <4AA93D34.8040500@arcor.de> Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 26fadb44250..aa7f8412101 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -545,14 +545,13 @@ update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) schedstat_set(se->wait_count, se->wait_count + 1); schedstat_set(se->wait_sum, se->wait_sum + rq_of(cfs_rq)->clock - se->wait_start); - schedstat_set(se->wait_start, 0); - #ifdef CONFIG_SCHEDSTATS if (entity_is_task(se)) { trace_sched_stat_wait(task_of(se), rq_of(cfs_rq)->clock - se->wait_start); } #endif + schedstat_set(se->wait_start, 0); } static inline void