Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip
* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip: (31 commits)
  sched: fix warning in fs/proc/base.c
  schedstat: consolidate per-task cpu runtime stats
  sched: use RCU variant of list traversal in for_each_leaf_rt_rq()
  sched, cpuacct: export percpu cpuacct cgroup stats
  sched, cpuacct: refactoring cpuusage_read / cpuusage_write
  sched: optimize update_curr()
  sched: fix wakeup preemption clock
  sched: add missing arch_update_cpu_topology() call
  sched: let arch_update_cpu_topology indicate if topology changed
  sched: idle_balance() does not call load_balance_newidle()
  sched: fix sd_parent_degenerate on non-numa smp machine
  sched: add uid information to sched_debug for CONFIG_USER_SCHED
  sched: move double_unlock_balance() higher
  sched: update comment for move_task_off_dead_cpu
  sched: fix inconsistency when redistribute per-cpu tg->cfs_rq shares
  sched/rt: removed unneeded defintion
  sched: add hierarchical accounting to cpu accounting controller
  sched: include group statistics in /proc/sched_debug
  sched: rename SCHED_NO_NO_OMIT_FRAME_POINTER => SCHED_OMIT_FRAME_POINTER
  sched: clean up SCHED_CPUMASK_ALLOC
  ...
commit a39b863342
20 changed files with 299 additions and 222 deletions
Documentation/controllers/cpuacct.txt (new file, 32 lines)

@@ -0,0 +1,32 @@
+CPU Accounting Controller
+-------------------------
+
+The CPU accounting controller is used to group tasks using cgroups and
+account the CPU usage of these groups of tasks.
+
+The CPU accounting controller supports multi-hierarchy groups. An accounting
+group accumulates the CPU usage of all of its child groups and the tasks
+directly present in its group.
+
+Accounting groups can be created by first mounting the cgroup filesystem.
+
+# mkdir /cgroups
+# mount -t cgroup -ocpuacct none /cgroups
+
+With the above step, the initial or the parent accounting group
+becomes visible at /cgroups. At bootup, this group includes all the
+tasks in the system. /cgroups/tasks lists the tasks in this cgroup.
+/cgroups/cpuacct.usage gives the CPU time (in nanoseconds) obtained by
+this group which is essentially the CPU time obtained by all the tasks
+in the system.
+
+New accounting groups can be created under the parent group /cgroups.
+
+# cd /cgroups
+# mkdir g1
+# echo $$ > g1
+
+The above steps create a new group g1 and move the current shell
+process (bash) into it. CPU time consumed by this bash and its children
+can be obtained from g1/cpuacct.usage and the same is accumulated in
+/cgroups/cpuacct.usage also.
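[Editor's note] The hierarchical behaviour described in the new document is what the kernel/sched.c changes later in this diff implement: every charge to a group is also propagated to its ancestors by walking a parent pointer (see the cpuacct_charge() hunk below). A minimal stand-alone C sketch of that accumulation scheme, with illustrative names only, not the kernel's actual data structures:

/* Illustrative sketch of hierarchical CPU-time accounting. */
struct acct_group {
	unsigned long long usage;	/* CPU time in nanoseconds */
	struct acct_group *parent;	/* NULL for the root group */
};

/* Charging a task's group also charges every ancestor up to the root. */
static void acct_charge(struct acct_group *grp, unsigned long long cputime)
{
	for (; grp; grp = grp->parent)
		grp->usage += cputime;
}

This is why /cgroups/cpuacct.usage (the root) always equals the sum over all tasks, while g1/cpuacct.usage only covers tasks in g1 and its children.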
@@ -8,7 +8,7 @@ Context switch
 By default, the switch_to arch function is called with the runqueue
 locked. This is usually not a problem unless switch_to may need to
 take the runqueue lock. This is usually due to a wake up operation in
-the context switch. See include/asm-ia64/system.h for an example.
+the context switch. See arch/ia64/include/asm/system.h for an example.

 To request the scheduler call switch_to with the runqueue unlocked,
 you must `#define __ARCH_WANT_UNLOCKED_CTXSW` in a header file

@@ -23,7 +23,7 @@ disabled. Interrupts may be enabled over the call if it is likely to
 introduce a significant interrupt latency by adding the line
 `#define __ARCH_WANT_INTERRUPTS_ON_CTXSW` in the same place as for
 unlocked context switches. This define also implies
-`__ARCH_WANT_UNLOCKED_CTXSW`. See include/asm-arm/system.h for an
+`__ARCH_WANT_UNLOCKED_CTXSW`. See arch/arm/include/asm/system.h for an
 example.
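[Editor's note] The two knobs named in this documentation are plain preprocessor defines in an architecture's system header; the path placeholder below is illustrative, not taken from a specific architecture:

/* arch/<arch>/include/asm/system.h, illustrative fragment */

/* Ask the scheduler to call switch_to() with the runqueue unlocked. */
#define __ARCH_WANT_UNLOCKED_CTXSW

/*
 * Additionally run the context switch with interrupts enabled; as the
 * documentation above says, this implies __ARCH_WANT_UNLOCKED_CTXSW.
 */
#define __ARCH_WANT_INTERRUPTS_ON_CTXSW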
@@ -99,7 +99,7 @@ config GENERIC_IOMAP
 	bool
 	default y

-config SCHED_NO_NO_OMIT_FRAME_POINTER
+config SCHED_OMIT_FRAME_POINTER
 	bool
 	default y

@@ -273,7 +273,7 @@ config GENERIC_CALIBRATE_DELAY
 	bool
 	default y

-config SCHED_NO_NO_OMIT_FRAME_POINTER
+config SCHED_OMIT_FRAME_POINTER
 	bool
 	default y

@@ -653,7 +653,7 @@ config GENERIC_CMOS_UPDATE
 	bool
 	default y

-config SCHED_NO_NO_OMIT_FRAME_POINTER
+config SCHED_OMIT_FRAME_POINTER
 	bool
 	default y

@@ -141,7 +141,7 @@ config GENERIC_NVRAM
 	bool
 	default y if PPC32

-config SCHED_NO_NO_OMIT_FRAME_POINTER
+config SCHED_OMIT_FRAME_POINTER
 	bool
 	default y
@@ -212,7 +212,7 @@ static void update_cpu_core_map(void)
 		cpu_core_map[cpu] = cpu_coregroup_map(cpu);
 }

-void arch_update_cpu_topology(void)
+int arch_update_cpu_topology(void)
 {
 	struct tl_info *info = tl_info;
 	struct sys_device *sysdev;

@@ -221,7 +221,7 @@ void arch_update_cpu_topology(void)
 	if (!machine_has_topology) {
 		update_cpu_core_map();
 		topology_update_polarization_simple();
-		return;
+		return 0;
 	}
 	stsi(info, 15, 1, 2);
 	tl_to_cores(info);

@@ -230,6 +230,7 @@ void arch_update_cpu_topology(void)
 		sysdev = get_cpu_sysdev(cpu);
 		kobject_uevent(&sysdev->kobj, KOBJ_CHANGE);
 	}
+	return 1;
 }

 static void topology_work_fn(struct work_struct *work)
@@ -368,10 +368,10 @@ config X86_RDC321X
	  as R-8610-(G).
	  If you don't have one of these chips, you should say N here.

-config SCHED_NO_NO_OMIT_FRAME_POINTER
+config SCHED_OMIT_FRAME_POINTER
	def_bool y
	prompt "Single-depth WCHAN output"
-	depends on X86_32
+	depends on X86
	help
	  Calculate simpler /proc/<PID>/wchan values. If this option
	  is disabled then wchan values will recurse back to the
@@ -347,8 +347,8 @@ static int proc_pid_wchan(struct task_struct *task, char *buffer)
 static int proc_pid_schedstat(struct task_struct *task, char *buffer)
 {
 	return sprintf(buffer, "%llu %llu %lu\n",
-			task->sched_info.cpu_time,
-			task->sched_info.run_delay,
+			(unsigned long long)task->se.sum_exec_runtime,
+			(unsigned long long)task->sched_info.run_delay,
 			task->sched_info.pcount);
 }
 #endif
@@ -23,7 +23,7 @@
 */

 #if defined(CONFIG_FRAME_POINTER) || \
-	!defined(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER)
+	!defined(CONFIG_SCHED_OMIT_FRAME_POINTER)
 #define M32R_PUSH_FP " push fp\n"
 #define M32R_POP_FP  " pop fp\n"
 #else
@@ -260,8 +260,6 @@ static inline int select_nohz_load_balancer(int cpu)
 }
 #endif

-extern unsigned long rt_needs_cpu(int cpu);
-
 /*
  * Only dump TASK_* tasks. (0 for all tasks)
  */

@@ -669,8 +667,7 @@ struct reclaim_state;
 struct sched_info {
 	/* cumulative counters */
 	unsigned long pcount;	      /* # of times run on this cpu */
-	unsigned long long cpu_time,  /* time spent on the cpu */
-			   run_delay; /* time spent waiting on a runqueue */
+	unsigned long long run_delay; /* time spent waiting on a runqueue */

 	/* timestamps */
 	unsigned long long last_arrival,/* when we last ran on a cpu */

@@ -2210,6 +2207,7 @@ extern void normalize_rt_tasks(void);
 extern struct task_group init_task_group;
 #ifdef CONFIG_USER_SCHED
 extern struct task_group root_task_group;
+extern void set_tg_uid(struct user_struct *user);
 #endif

 extern struct task_group *sched_create_group(struct task_group *parent);
@@ -49,7 +49,7 @@
	for_each_online_node(node) \
		if (nr_cpus_node(node))

-void arch_update_cpu_topology(void);
+int arch_update_cpu_topology(void);

 /* Conform to ACPI 2.0 SLIT distance definitions */
 #define LOCAL_DISTANCE	10
@@ -19,7 +19,6 @@ CFLAGS_REMOVE_mutex-debug.o = -pg
 CFLAGS_REMOVE_rtmutex-debug.o = -pg
 CFLAGS_REMOVE_cgroup-debug.o = -pg
 CFLAGS_REMOVE_sched_clock.o = -pg
-CFLAGS_REMOVE_sched.o = -pg
 endif

 obj-$(CONFIG_FREEZER) += freezer.o

@@ -90,7 +89,7 @@ obj-$(CONFIG_FUNCTION_TRACER) += trace/
 obj-$(CONFIG_TRACING) += trace/
 obj-$(CONFIG_SMP) += sched_cpupri.o

-ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y)
+ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
 # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
 # needed for x86 only. Why this used to be enabled for all architectures is beyond
 # me. I suspect most platforms don't need this, but until we know that for sure
@@ -127,7 +127,7 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
	 */
	t1 = tsk->sched_info.pcount;
	t2 = tsk->sched_info.run_delay;
-	t3 = tsk->sched_info.cpu_time;
+	t3 = tsk->se.sum_exec_runtime;

	d->cpu_count += t1;
kernel/sched.c (359 changed lines)
@@ -267,6 +267,10 @@ struct task_group {
	struct cgroup_subsys_state css;
 #endif

+#ifdef CONFIG_USER_SCHED
+	uid_t uid;
+#endif
+
 #ifdef CONFIG_FAIR_GROUP_SCHED
	/* schedulable entities of this group on each cpu */
	struct sched_entity **se;

@@ -292,6 +296,12 @@ struct task_group {

 #ifdef CONFIG_USER_SCHED

+/* Helper function to pass uid information to create_sched_user() */
+void set_tg_uid(struct user_struct *user)
+{
+	user->tg->uid = user->uid;
+}
+
 /*
  * Root task group.
  *	Every UID task group (including init_task_group aka UID-0) will

@@ -594,6 +604,8 @@ struct rq {
 #ifdef CONFIG_SCHEDSTATS
	/* latency stats */
	struct sched_info rq_sched_info;
+	unsigned long long rq_cpu_time;
+	/* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */

	/* sys_sched_yield() stats */
	unsigned int yld_exp_empty;
@@ -711,45 +723,18 @@ static __read_mostly char *sched_feat_names[] = {

 #undef SCHED_FEAT

-static int sched_feat_open(struct inode *inode, struct file *filp)
+static int sched_feat_show(struct seq_file *m, void *v)
 {
-	filp->private_data = inode->i_private;
-	return 0;
-}
-
-static ssize_t
-sched_feat_read(struct file *filp, char __user *ubuf,
-		size_t cnt, loff_t *ppos)
-{
-	char *buf;
-	int r = 0;
-	int len = 0;
	int i;

	for (i = 0; sched_feat_names[i]; i++) {
-		len += strlen(sched_feat_names[i]);
-		len += 4;
+		if (!(sysctl_sched_features & (1UL << i)))
+			seq_puts(m, "NO_");
+		seq_printf(m, "%s ", sched_feat_names[i]);
	}
+	seq_puts(m, "\n");

-	buf = kmalloc(len + 2, GFP_KERNEL);
-	if (!buf)
-		return -ENOMEM;
-
-	for (i = 0; sched_feat_names[i]; i++) {
-		if (sysctl_sched_features & (1UL << i))
-			r += sprintf(buf + r, "%s ", sched_feat_names[i]);
-		else
-			r += sprintf(buf + r, "NO_%s ", sched_feat_names[i]);
-	}
-
-	r += sprintf(buf + r, "\n");
-	WARN_ON(r >= len + 2);
-
-	r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
-
-	kfree(buf);
-
-	return r;
+	return 0;
 }

 static ssize_t

@@ -794,10 +779,17 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
	return cnt;
 }

+static int sched_feat_open(struct inode *inode, struct file *filp)
+{
+	return single_open(filp, sched_feat_show, NULL);
+}
+
 static struct file_operations sched_feat_fops = {
	.open		= sched_feat_open,
-	.read		= sched_feat_read,
	.write		= sched_feat_write,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
 };

 static __init int sched_init_debug(void)
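[Editor's note] For readers unfamiliar with the seq_file interface used in the conversion above: single_open() binds a show() callback to the file, and seq_read/seq_lseek/single_release supply the remaining file operations, so the hand-rolled buffer management can be deleted. A minimal, self-contained sketch of the same pattern, with hypothetical names that are not part of this commit:

#include <linux/seq_file.h>
#include <linux/fs.h>

/* show() is called by the seq_file core; its output becomes the file contents. */
static int example_show(struct seq_file *m, void *v)
{
	seq_puts(m, "hello from seq_file\n");
	return 0;
}

static int example_open(struct inode *inode, struct file *filp)
{
	return single_open(filp, example_show, NULL);
}

static const struct file_operations example_fops = {
	.open		= example_open,
	.read		= seq_read,	/* buffering handled by seq_file */
	.llseek		= seq_lseek,
	.release	= single_release,
};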
@@ -1482,27 +1474,13 @@ static void
 update_group_shares_cpu(struct task_group *tg, int cpu,
			unsigned long sd_shares, unsigned long sd_rq_weight)
 {
-	int boost = 0;
	unsigned long shares;
	unsigned long rq_weight;

	if (!tg->se[cpu])
		return;

-	rq_weight = tg->cfs_rq[cpu]->load.weight;
+	rq_weight = tg->cfs_rq[cpu]->rq_weight;

-	/*
-	 * If there are currently no tasks on the cpu pretend there is one of
-	 * average load so that when a new task gets to run here it will not
-	 * get delayed by group starvation.
-	 */
-	if (!rq_weight) {
-		boost = 1;
-		rq_weight = NICE_0_LOAD;
-	}
-
-	if (unlikely(rq_weight > sd_rq_weight))
-		rq_weight = sd_rq_weight;
-
	/*
	 *		\Sum shares * rq_weight

@@ -1510,7 +1488,7 @@ update_group_shares_cpu(struct task_group *tg, int cpu,
	 *		\Sum rq_weight
	 *
	 */
-	shares = (sd_shares * rq_weight) / (sd_rq_weight + 1);
+	shares = (sd_shares * rq_weight) / sd_rq_weight;
	shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES);

	if (abs(shares - tg->se[cpu]->load.weight) >

@@ -1519,11 +1497,7 @@ update_group_shares_cpu(struct task_group *tg, int cpu,
		unsigned long flags;

		spin_lock_irqsave(&rq->lock, flags);
-		/*
-		 * record the actual number of shares, not the boosted amount.
-		 */
-		tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
-		tg->cfs_rq[cpu]->rq_weight = rq_weight;
+		tg->cfs_rq[cpu]->shares = shares;

		__set_se_shares(tg->se[cpu], shares);
		spin_unlock_irqrestore(&rq->lock, flags);

@@ -1537,13 +1511,23 @@ update_group_shares_cpu(struct task_group *tg, int cpu,
 */
 static int tg_shares_up(struct task_group *tg, void *data)
 {
-	unsigned long rq_weight = 0;
+	unsigned long weight, rq_weight = 0;
	unsigned long shares = 0;
	struct sched_domain *sd = data;
	int i;

	for_each_cpu_mask(i, sd->span) {
-		rq_weight += tg->cfs_rq[i]->load.weight;
+		/*
+		 * If there are currently no tasks on the cpu pretend there
+		 * is one of average load so that when a new task gets to
+		 * run here it will not get delayed by group starvation.
+		 */
+		weight = tg->cfs_rq[i]->load.weight;
+		if (!weight)
+			weight = NICE_0_LOAD;
+
+		tg->cfs_rq[i]->rq_weight = weight;
+		rq_weight += weight;
		shares += tg->cfs_rq[i]->shares;
	}

@@ -1553,9 +1537,6 @@ static int tg_shares_up(struct task_group *tg, void *data)
	if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))
		shares = tg->shares;

-	if (!rq_weight)
-		rq_weight = cpus_weight(sd->span) * NICE_0_LOAD;
-
	for_each_cpu_mask(i, sd->span)
		update_group_shares_cpu(tg, i, shares, rq_weight);
@@ -1620,6 +1601,39 @@ static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd)

 #endif

+/*
+ * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
+ */
+static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
+	__releases(this_rq->lock)
+	__acquires(busiest->lock)
+	__acquires(this_rq->lock)
+{
+	int ret = 0;
+
+	if (unlikely(!irqs_disabled())) {
+		/* printk() doesn't work good under rq->lock */
+		spin_unlock(&this_rq->lock);
+		BUG_ON(1);
+	}
+	if (unlikely(!spin_trylock(&busiest->lock))) {
+		if (busiest < this_rq) {
+			spin_unlock(&this_rq->lock);
+			spin_lock(&busiest->lock);
+			spin_lock_nested(&this_rq->lock, SINGLE_DEPTH_NESTING);
+			ret = 1;
+		} else
+			spin_lock_nested(&busiest->lock, SINGLE_DEPTH_NESTING);
+	}
+	return ret;
+}
+
+static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
+	__releases(busiest->lock)
+{
+	spin_unlock(&busiest->lock);
+	lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
+}
 #endif

 #ifdef CONFIG_FAIR_GROUP_SCHED
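[Editor's note] The trylock-then-reorder dance in double_lock_balance() above exists to avoid an ABBA deadlock: two CPUs each holding their own rq->lock while waiting for the other's. It resolves the conflict by imposing a global order (lower runqueue address first), dropping and re-taking this_rq->lock when needed and reporting that via the return value. A stripped-down sketch of the same address-ordering idea, with hypothetical names:

#include <linux/spinlock.h>

/*
 * Take two spinlocks in a globally consistent order (by address) so that
 * concurrent callers can never wait on each other in a cycle.
 */
static void lock_pair_ordered(spinlock_t *a, spinlock_t *b)
{
	if (a < b) {
		spin_lock(a);
		spin_lock_nested(b, SINGLE_DEPTH_NESTING);
	} else {
		spin_lock(b);
		spin_lock_nested(a, SINGLE_DEPTH_NESTING);
	}
}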
@@ -2264,6 +2278,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)

	smp_wmb();
	rq = task_rq_lock(p, &flags);
+	update_rq_clock(rq);
	old_state = p->state;
	if (!(old_state & state))
		goto out;

@@ -2321,7 +2336,6 @@ out_activate:
		schedstat_inc(p, se.nr_wakeups_local);
	else
		schedstat_inc(p, se.nr_wakeups_remote);
-	update_rq_clock(rq);
	activate_task(rq, p, 1);
	success = 1;
@@ -2821,40 +2835,6 @@ static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
	__release(rq2->lock);
 }

-/*
- * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
- */
-static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
-	__releases(this_rq->lock)
-	__acquires(busiest->lock)
-	__acquires(this_rq->lock)
-{
-	int ret = 0;
-
-	if (unlikely(!irqs_disabled())) {
-		/* printk() doesn't work good under rq->lock */
-		spin_unlock(&this_rq->lock);
-		BUG_ON(1);
-	}
-	if (unlikely(!spin_trylock(&busiest->lock))) {
-		if (busiest < this_rq) {
-			spin_unlock(&this_rq->lock);
-			spin_lock(&busiest->lock);
-			spin_lock_nested(&this_rq->lock, SINGLE_DEPTH_NESTING);
-			ret = 1;
-		} else
-			spin_lock_nested(&busiest->lock, SINGLE_DEPTH_NESTING);
-	}
-	return ret;
-}
-
-static void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
-	__releases(busiest->lock)
-{
-	spin_unlock(&busiest->lock);
-	lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
-}
-
 /*
 * If dest_cpu is allowed for this process, migrate the task to it.
 * This is accomplished by forcing the cpu_allowed mask to only
@@ -3716,7 +3696,7 @@ out_balanced:
 static void idle_balance(int this_cpu, struct rq *this_rq)
 {
	struct sched_domain *sd;
-	int pulled_task = -1;
+	int pulled_task = 0;
	unsigned long next_balance = jiffies + HZ;
	cpumask_t tmpmask;
@@ -6150,7 +6130,6 @@ static int __migrate_task_irq(struct task_struct *p, int src_cpu, int dest_cpu)

 /*
 * Figure out where task on dead CPU should go, use force if necessary.
- * NOTE: interrupts should be disabled by the caller
 */
 static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
 {
@@ -6662,28 +6641,6 @@ early_initcall(migration_init);

 #ifdef CONFIG_SCHED_DEBUG

-static inline const char *sd_level_to_string(enum sched_domain_level lvl)
-{
-	switch (lvl) {
-	case SD_LV_NONE:
-			return "NONE";
-	case SD_LV_SIBLING:
-			return "SIBLING";
-	case SD_LV_MC:
-			return "MC";
-	case SD_LV_CPU:
-			return "CPU";
-	case SD_LV_NODE:
-			return "NODE";
-	case SD_LV_ALLNODES:
-			return "ALLNODES";
-	case SD_LV_MAX:
-			return "MAX";
-
-	}
-	return "MAX";
-}
-
 static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
				  cpumask_t *groupmask)
 {
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
|
|
||||||
printk(KERN_CONT "span %s level %s\n",
|
printk(KERN_CONT "span %s level %s\n", str, sd->name);
|
||||||
str, sd_level_to_string(sd->level));
|
|
||||||
|
|
||||||
if (!cpu_isset(cpu, sd->span)) {
|
if (!cpu_isset(cpu, sd->span)) {
|
||||||
printk(KERN_ERR "ERROR: domain->span does not contain "
|
printk(KERN_ERR "ERROR: domain->span does not contain "
|
||||||
|
@@ -6840,6 +6796,8 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
				SD_BALANCE_EXEC |
				SD_SHARE_CPUPOWER |
				SD_SHARE_PKG_RESOURCES);
+		if (nr_node_ids == 1)
+			pflags &= ~SD_SERIALIZE;
	}
	if (~cflags & pflags)
		return 0;
@@ -7360,13 +7318,21 @@ struct allmasks {
 };

 #if NR_CPUS > 128
-#define SCHED_CPUMASK_ALLOC		1
-#define SCHED_CPUMASK_FREE(v)		kfree(v)
 #define SCHED_CPUMASK_DECLARE(v)	struct allmasks *v
+static inline void sched_cpumask_alloc(struct allmasks **masks)
+{
+	*masks = kmalloc(sizeof(**masks), GFP_KERNEL);
+}
+static inline void sched_cpumask_free(struct allmasks *masks)
+{
+	kfree(masks);
+}
 #else
-#define SCHED_CPUMASK_ALLOC		0
-#define SCHED_CPUMASK_FREE(v)
 #define SCHED_CPUMASK_DECLARE(v)	struct allmasks _v, *v = &_v
+static inline void sched_cpumask_alloc(struct allmasks **masks)
+{ }
+static inline void sched_cpumask_free(struct allmasks *masks)
+{ }
 #endif

 #define SCHED_CPUMASK_VAR(v, a)	cpumask_t *v = (cpumask_t *) \
@@ -7442,9 +7408,8 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
		return -ENOMEM;
	}

-#if SCHED_CPUMASK_ALLOC
	/* get space for all scratch cpumask variables */
-	allmasks = kmalloc(sizeof(*allmasks), GFP_KERNEL);
+	sched_cpumask_alloc(&allmasks);
	if (!allmasks) {
		printk(KERN_WARNING "Cannot alloc cpumask array\n");
		kfree(rd);
@@ -7453,7 +7418,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 #endif
		return -ENOMEM;
	}
-#endif
+
	tmpmask = (cpumask_t *)allmasks;
@@ -7707,13 +7672,13 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
		cpu_attach_domain(sd, rd, i);
	}

-	SCHED_CPUMASK_FREE((void *)allmasks);
+	sched_cpumask_free(allmasks);
	return 0;

 #ifdef CONFIG_NUMA
 error:
	free_sched_groups(cpu_map, tmpmask);
-	SCHED_CPUMASK_FREE((void *)allmasks);
+	sched_cpumask_free(allmasks);
	kfree(rd);
	return -ENOMEM;
 #endif
@@ -7736,8 +7701,14 @@ static struct sched_domain_attr *dattr_cur;
 */
 static cpumask_t fallback_doms;

-void __attribute__((weak)) arch_update_cpu_topology(void)
+/*
+ * arch_update_cpu_topology lets virtualized architectures update the
+ * cpu core maps. It is supposed to return 1 if the topology changed
+ * or 0 if it stayed the same.
+ */
+int __attribute__((weak)) arch_update_cpu_topology(void)
 {
+	return 0;
 }

 /*
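[Editor's note] The weak definition above is only a fallback: an architecture that can actually detect topology changes overrides it by providing a non-weak definition with the same signature, exactly as the s390 hunk earlier in this diff does, and the linker picks the strong symbol. A minimal sketch of that weak-default / arch-override pattern (the two definitions live in separate translation units; names are illustrative):

/* Generic code (one translation unit): fallback used when no override exists. */
int __attribute__((weak)) arch_update_cpu_topology(void)
{
	return 0;	/* topology unchanged */
}

/* Architecture code (another translation unit): replaces the weak default. */
int arch_update_cpu_topology(void)
{
	/* ...rebuild the cpu core maps here... */
	return 1;	/* tell the scheduler the topology changed */
}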
@@ -7777,8 +7748,6 @@ static void detach_destroy_domains(const cpumask_t *cpu_map)
	cpumask_t tmpmask;
	int i;

-	unregister_sched_domain_sysctl();
-
	for_each_cpu_mask_nr(i, *cpu_map)
		cpu_attach_domain(NULL, &def_root_domain, i);
	synchronize_sched();
@@ -7831,17 +7800,21 @@ void partition_sched_domains(int ndoms_new, cpumask_t *doms_new,
			     struct sched_domain_attr *dattr_new)
 {
	int i, j, n;
+	int new_topology;

	mutex_lock(&sched_domains_mutex);

	/* always unregister in case we don't destroy any domains */
	unregister_sched_domain_sysctl();

+	/* Let architecture update cpu core mappings. */
+	new_topology = arch_update_cpu_topology();
+
	n = doms_new ? ndoms_new : 0;

	/* Destroy deleted domains */
	for (i = 0; i < ndoms_cur; i++) {
-		for (j = 0; j < n; j++) {
+		for (j = 0; j < n && !new_topology; j++) {
			if (cpus_equal(doms_cur[i], doms_new[j])
			    && dattrs_equal(dattr_cur, i, dattr_new, j))
				goto match1;

@@ -7856,12 +7829,12 @@ match1:
		ndoms_cur = 0;
		doms_new = &fallback_doms;
		cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map);
-		dattr_new = NULL;
+		WARN_ON_ONCE(dattr_new);
	}

	/* Build new domains */
	for (i = 0; i < ndoms_new; i++) {
-		for (j = 0; j < ndoms_cur; j++) {
+		for (j = 0; j < ndoms_cur && !new_topology; j++) {
			if (cpus_equal(doms_new[i], doms_cur[j])
			    && dattrs_equal(dattr_new, i, dattr_cur, j))
				goto match2;
@@ -8516,7 +8489,7 @@ static
 int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
 {
	struct cfs_rq *cfs_rq;
-	struct sched_entity *se, *parent_se;
+	struct sched_entity *se;
	struct rq *rq;
	int i;

@@ -8532,18 +8505,17 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
	for_each_possible_cpu(i) {
		rq = cpu_rq(i);

-		cfs_rq = kmalloc_node(sizeof(struct cfs_rq),
-				      GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
+		cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
+				      GFP_KERNEL, cpu_to_node(i));
		if (!cfs_rq)
			goto err;

-		se = kmalloc_node(sizeof(struct sched_entity),
-				  GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
+		se = kzalloc_node(sizeof(struct sched_entity),
+				  GFP_KERNEL, cpu_to_node(i));
		if (!se)
			goto err;

-		parent_se = parent ? parent->se[i] : NULL;
-		init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent_se);
+		init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent->se[i]);
	}

	return 1;
@@ -8604,7 +8576,7 @@ static
 int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
 {
	struct rt_rq *rt_rq;
-	struct sched_rt_entity *rt_se, *parent_se;
+	struct sched_rt_entity *rt_se;
	struct rq *rq;
	int i;

@@ -8621,18 +8593,17 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
	for_each_possible_cpu(i) {
		rq = cpu_rq(i);

-		rt_rq = kmalloc_node(sizeof(struct rt_rq),
-				     GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
+		rt_rq = kzalloc_node(sizeof(struct rt_rq),
+				     GFP_KERNEL, cpu_to_node(i));
		if (!rt_rq)
			goto err;

-		rt_se = kmalloc_node(sizeof(struct sched_rt_entity),
-				     GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
+		rt_se = kzalloc_node(sizeof(struct sched_rt_entity),
+				     GFP_KERNEL, cpu_to_node(i));
		if (!rt_se)
			goto err;

-		parent_se = parent ? parent->rt_se[i] : NULL;
-		init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent_se);
+		init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent->rt_se[i]);
	}

	return 1;
@@ -9275,11 +9246,12 @@ struct cgroup_subsys cpu_cgroup_subsys = {
 * (balbir@in.ibm.com).
 */

-/* track cpu usage of a group of tasks */
+/* track cpu usage of a group of tasks and its child groups */
 struct cpuacct {
	struct cgroup_subsys_state css;
	/* cpuusage holds pointer to a u64-type object on every cpu */
	u64 *cpuusage;
+	struct cpuacct *parent;
 };

 struct cgroup_subsys cpuacct_subsys;

@@ -9313,6 +9285,9 @@ static struct cgroup_subsys_state *cpuacct_create(
		return ERR_PTR(-ENOMEM);
	}

+	if (cgrp->parent)
+		ca->parent = cgroup_ca(cgrp->parent);
+
	return &ca->css;
 }
@@ -9326,6 +9301,41 @@ cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
	kfree(ca);
 }

+static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)
+{
+	u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu);
+	u64 data;
+
+#ifndef CONFIG_64BIT
+	/*
+	 * Take rq->lock to make 64-bit read safe on 32-bit platforms.
+	 */
+	spin_lock_irq(&cpu_rq(cpu)->lock);
+	data = *cpuusage;
+	spin_unlock_irq(&cpu_rq(cpu)->lock);
+#else
+	data = *cpuusage;
+#endif
+
+	return data;
+}
+
+static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
+{
+	u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu);
+
+#ifndef CONFIG_64BIT
+	/*
+	 * Take rq->lock to make 64-bit write safe on 32-bit platforms.
+	 */
+	spin_lock_irq(&cpu_rq(cpu)->lock);
+	*cpuusage = val;
+	spin_unlock_irq(&cpu_rq(cpu)->lock);
+#else
+	*cpuusage = val;
+#endif
+}
+
 /* return total cpu usage (in nanoseconds) of a group */
 static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft)
 {
@@ -9333,17 +9343,8 @@ static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft)
	u64 totalcpuusage = 0;
	int i;

-	for_each_possible_cpu(i) {
-		u64 *cpuusage = percpu_ptr(ca->cpuusage, i);
-
-		/*
-		 * Take rq->lock to make 64-bit addition safe on 32-bit
-		 * platforms.
-		 */
-		spin_lock_irq(&cpu_rq(i)->lock);
-		totalcpuusage += *cpuusage;
-		spin_unlock_irq(&cpu_rq(i)->lock);
-	}
+	for_each_present_cpu(i)
+		totalcpuusage += cpuacct_cpuusage_read(ca, i);

	return totalcpuusage;
 }
@@ -9360,23 +9361,39 @@ static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype,
		goto out;
	}

-	for_each_possible_cpu(i) {
-		u64 *cpuusage = percpu_ptr(ca->cpuusage, i);
-
-		spin_lock_irq(&cpu_rq(i)->lock);
-		*cpuusage = 0;
-		spin_unlock_irq(&cpu_rq(i)->lock);
-	}
+	for_each_present_cpu(i)
+		cpuacct_cpuusage_write(ca, i, 0);

 out:
	return err;
 }

+static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft,
+				   struct seq_file *m)
+{
+	struct cpuacct *ca = cgroup_ca(cgroup);
+	u64 percpu;
+	int i;
+
+	for_each_present_cpu(i) {
+		percpu = cpuacct_cpuusage_read(ca, i);
+		seq_printf(m, "%llu ", (unsigned long long) percpu);
+	}
+	seq_printf(m, "\n");
+	return 0;
+}
+
 static struct cftype files[] = {
	{
		.name = "usage",
		.read_u64 = cpuusage_read,
		.write_u64 = cpuusage_write,
	},
+	{
+		.name = "usage_percpu",
+		.read_seq_string = cpuacct_percpu_seq_read,
+	},
+
 };

 static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
|
||||||
static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
|
static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
|
||||||
{
|
{
|
||||||
struct cpuacct *ca;
|
struct cpuacct *ca;
|
||||||
|
int cpu;
|
||||||
|
|
||||||
if (!cpuacct_subsys.active)
|
if (!cpuacct_subsys.active)
|
||||||
return;
|
return;
|
||||||
|
|
||||||
|
cpu = task_cpu(tsk);
|
||||||
ca = task_ca(tsk);
|
ca = task_ca(tsk);
|
||||||
if (ca) {
|
|
||||||
u64 *cpuusage = percpu_ptr(ca->cpuusage, task_cpu(tsk));
|
|
||||||
|
|
||||||
|
for (; ca; ca = ca->parent) {
|
||||||
|
u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu);
|
||||||
*cpuusage += cputime;
|
*cpuusage += cputime;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@@ -53,6 +53,40 @@ static unsigned long nsec_low(unsigned long long nsec)

 #define SPLIT_NS(x) nsec_high(x), nsec_low(x)

+#ifdef CONFIG_FAIR_GROUP_SCHED
+static void print_cfs_group_stats(struct seq_file *m, int cpu,
+				  struct task_group *tg)
+{
+	struct sched_entity *se = tg->se[cpu];
+	if (!se)
+		return;
+
+#define P(F) \
+	SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)F)
+#define PN(F) \
+	SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F))
+
+	PN(se->exec_start);
+	PN(se->vruntime);
+	PN(se->sum_exec_runtime);
+#ifdef CONFIG_SCHEDSTATS
+	PN(se->wait_start);
+	PN(se->sleep_start);
+	PN(se->block_start);
+	PN(se->sleep_max);
+	PN(se->block_max);
+	PN(se->exec_max);
+	PN(se->slice_max);
+	PN(se->wait_max);
+	PN(se->wait_sum);
+	P(se->wait_count);
+#endif
+	P(se->load.weight);
+#undef PN
+#undef P
+}
+#endif
+
 static void
 print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
 {
@@ -121,20 +155,19 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)

 #if defined(CONFIG_CGROUP_SCHED) && defined(CONFIG_FAIR_GROUP_SCHED)
	char path[128] = "";
-	struct cgroup *cgroup = NULL;
	struct task_group *tg = cfs_rq->tg;

-	if (tg)
-		cgroup = tg->css.cgroup;
-
-	if (cgroup)
-		cgroup_path(cgroup, path, sizeof(path));
+	cgroup_path(tg->css.cgroup, path, sizeof(path));

	SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, path);
+#elif defined(CONFIG_USER_SCHED) && defined(CONFIG_FAIR_GROUP_SCHED)
+	{
+		uid_t uid = cfs_rq->tg->uid;
+		SEQ_printf(m, "\ncfs_rq[%d] for UID: %u\n", cpu, uid);
+	}
 #else
	SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu);
 #endif

	SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock",
			SPLIT_NS(cfs_rq->exec_clock));

@@ -168,6 +201,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
 #ifdef CONFIG_SMP
	SEQ_printf(m, " .%-30s: %lu\n", "shares", cfs_rq->shares);
 #endif
+	print_cfs_group_stats(m, cpu, cfs_rq->tg);
 #endif
 }
@@ -175,14 +209,9 @@ void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
 {
 #if defined(CONFIG_CGROUP_SCHED) && defined(CONFIG_RT_GROUP_SCHED)
	char path[128] = "";
-	struct cgroup *cgroup = NULL;
	struct task_group *tg = rt_rq->tg;

-	if (tg)
-		cgroup = tg->css.cgroup;
-
-	if (cgroup)
-		cgroup_path(cgroup, path, sizeof(path));
+	cgroup_path(tg->css.cgroup, path, sizeof(path));

	SEQ_printf(m, "\nrt_rq[%d]:%s\n", cpu, path);
 #else
@@ -272,7 +301,7 @@ static int sched_debug_show(struct seq_file *m, void *v)
	u64 now = ktime_to_ns(ktime_get());
	int cpu;

-	SEQ_printf(m, "Sched Debug Version: v0.07, %s %.*s\n",
+	SEQ_printf(m, "Sched Debug Version: v0.08, %s %.*s\n",
		init_utsname()->release,
		(int)strcspn(init_utsname()->version, " "),
		init_utsname()->version);
@@ -492,6 +492,8 @@ static void update_curr(struct cfs_rq *cfs_rq)
	 * overflow on 32 bits):
	 */
	delta_exec = (unsigned long)(now - curr->exec_start);
+	if (!delta_exec)
+		return;

	__update_curr(cfs_rq, curr, delta_exec);
	curr->exec_start = now;

@@ -1345,12 +1347,11 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync)
 {
	struct task_struct *curr = rq->curr;
	struct sched_entity *se = &curr->se, *pse = &p->se;
+	struct cfs_rq *cfs_rq = task_cfs_rq(curr);

-	if (unlikely(rt_prio(p->prio))) {
-		struct cfs_rq *cfs_rq = task_cfs_rq(curr);
-
-		update_rq_clock(rq);
-		update_curr(cfs_rq);
+	update_curr(cfs_rq);

+	if (unlikely(rt_prio(p->prio))) {
		resched_task(curr);
		return;
	}
@@ -77,7 +77,7 @@ static inline u64 sched_rt_period(struct rt_rq *rt_rq)
 }

 #define for_each_leaf_rt_rq(rt_rq, rq) \
-	list_for_each_entry(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list)
+	list_for_each_entry_rcu(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list)

 static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
 {

@@ -537,14 +537,14 @@ static void update_curr_rt(struct rq *rq)
	for_each_sched_rt_entity(rt_se) {
		rt_rq = rt_rq_of_se(rt_se);

-		spin_lock(&rt_rq->rt_runtime_lock);
		if (sched_rt_runtime(rt_rq) != RUNTIME_INF) {
+			spin_lock(&rt_rq->rt_runtime_lock);
			rt_rq->rt_time += delta_exec;
			if (sched_rt_runtime_exceeded(rt_rq))
				resched_task(curr);
+			spin_unlock(&rt_rq->rt_runtime_lock);
		}
-		spin_unlock(&rt_rq->rt_runtime_lock);
	}
 }

 static inline

@@ -909,9 +909,6 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
 /* Only try algorithms three times */
 #define RT_MAX_TRIES 3

-static int double_lock_balance(struct rq *this_rq, struct rq *busiest);
-static void double_unlock_balance(struct rq *this_rq, struct rq *busiest);
-
 static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep);

 static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
@@ -31,7 +31,7 @@ static int show_schedstat(struct seq_file *seq, void *v)
		    rq->yld_act_empty, rq->yld_exp_empty, rq->yld_count,
		    rq->sched_switch, rq->sched_count, rq->sched_goidle,
		    rq->ttwu_count, rq->ttwu_local,
-		    rq->rq_sched_info.cpu_time,
+		    rq->rq_cpu_time,
		    rq->rq_sched_info.run_delay, rq->rq_sched_info.pcount);

		seq_printf(seq, "\n");

@@ -123,7 +123,7 @@ static inline void
 rq_sched_info_depart(struct rq *rq, unsigned long long delta)
 {
	if (rq)
-		rq->rq_sched_info.cpu_time += delta;
+		rq->rq_cpu_time += delta;
 }

 static inline void

@@ -236,7 +236,6 @@ static inline void sched_info_depart(struct task_struct *t)
	unsigned long long delta = task_rq(t)->clock -
					t->sched_info.last_arrival;

-	t->sched_info.cpu_time += delta;
	rq_sched_info_depart(task_rq(t), delta);

	if (t->state == TASK_RUNNING)
@@ -104,6 +104,8 @@ static int sched_create_user(struct user_struct *up)
	if (IS_ERR(up->tg))
		rc = -ENOMEM;

+	set_tg_uid(up);
+
	return rc;
 }