Merge branch 'sched-fixes-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip

* 'sched-fixes-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip:
  sched: do not count frozen tasks toward load
  sched: refresh MAINTAINERS entry
  sched: Print sched_group::__cpu_power in sched_domain_debug
  cpuacct: add per-cgroup utime/stime statistics
  posixtimers, sched: Fix posix clock monotonicity
  sched_rt: don't allocate cpumask in fastpath
  cpuacct: make cpuacct hierarchy walk in cpuacct_charge() safe when rcupreempt is used -v2
commit 17b2e9bf27
Author: Linus Torvalds
Date:   2009-04-09 10:37:28 -07:00
7 changed files with 178 additions and 34 deletions

Documentation/cgroups/cpuacct.txt

@@ -30,3 +30,21 @@ The above steps create a new group g1 and move the current shell
 process (bash) into it. CPU time consumed by this bash and its children
 can be obtained from g1/cpuacct.usage and the same is accumulated in
 /cgroups/cpuacct.usage also.
+
+cpuacct.stat file lists a few statistics which further divide the
+CPU time obtained by the cgroup into user and system times. Currently
+the following statistics are supported:
+
+user: Time spent by tasks of the cgroup in user mode.
+system: Time spent by tasks of the cgroup in kernel mode.
+
+user and system are in USER_HZ units.
+
+cpuacct controller uses percpu_counter interface to collect user and
+system times. This has two side effects:
+
+- It is theoretically possible to see wrong values for user and system times.
+  This is because percpu_counter_read() on 32bit systems isn't safe
+  against concurrent writes.
+- It is possible to see slightly outdated values for user and system times
+  due to the batch processing nature of percpu_counter.
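As a concrete illustration of the interface described above (an editorial sketch, not part of the commit): a minimal userspace reader for cpuacct.stat. The /cgroups mount point and the g1 group are taken from the example earlier in this file and are assumptions about your setup; sysconf(_SC_CLK_TCK) supplies USER_HZ for the tick-to-second conversion.

#define _POSIX_C_SOURCE 200112L
#include <stdio.h>
#include <unistd.h>

int main(void)
{
    FILE *f = fopen("/cgroups/g1/cpuacct.stat", "r"); /* assumed path */
    char key[32];
    long long ticks;
    long hz = sysconf(_SC_CLK_TCK);     /* USER_HZ as seen by userspace */

    if (!f) {
        perror("cpuacct.stat");
        return 1;
    }
    if (hz <= 0)
        hz = 100;                       /* conservative fallback */

    /* Each line is "<name> <value>", e.g. "user 4053" then "system 1421". */
    while (fscanf(f, "%31s %lld", key, &ticks) == 2)
        printf("%s: %.2f s\n", key, (double)ticks / (double)hz);

    fclose(f);
    return 0;
}

Because of the percpu_counter batching noted above, two back-to-back reads may differ slightly even on an idle group.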

MAINTAINERS

@@ -3873,8 +3873,8 @@ S:	Maintained
 SCHEDULER
 P:	Ingo Molnar
 M:	mingo@elte.hu
-P:	Robert Love    [the preemptible kernel bits]
-M:	rml@tech9.net
+P:	Peter Zijlstra
+M:	peterz@infradead.org
 L:	linux-kernel@vger.kernel.org
 S:	Maintained

include/linux/sched.h

@@ -205,7 +205,8 @@ extern unsigned long long time_sync_thresh;
 #define task_is_stopped_or_traced(task) \
                         ((task->state & (__TASK_STOPPED | __TASK_TRACED)) != 0)
 #define task_contributes_to_load(task) \
-                        ((task->state & TASK_UNINTERRUPTIBLE) != 0)
+                        ((task->state & TASK_UNINTERRUPTIBLE) != 0 && \
+                         (task->flags & PF_FROZEN) == 0)
 #define __set_task_state(tsk, state_value) \
         do { (tsk)->state = (state_value); } while (0)
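To see what the new predicate changes, here is a standalone sketch (an editorial illustration, not from the commit) that mirrors the macro against mock task values. The struct mock_task type is invented for the example, and the constant values are copied from kernels of this era for illustration only: an uninterruptible sleeper still counts toward the load average, while the same task with PF_FROZEN set no longer does.

#include <stdio.h>

#define TASK_UNINTERRUPTIBLE 2          /* illustrative value */
#define PF_FROZEN 0x00010000            /* illustrative, era-specific value */

struct mock_task {                      /* invented stand-in for task_struct */
    long state;
    unsigned long flags;
};

/* Mirrors the new task_contributes_to_load() definition. */
static int contributes_to_load(const struct mock_task *t)
{
    return (t->state & TASK_UNINTERRUPTIBLE) != 0 &&
           (t->flags & PF_FROZEN) == 0;
}

int main(void)
{
    struct mock_task sleeper = { TASK_UNINTERRUPTIBLE, 0 };
    struct mock_task frozen  = { TASK_UNINTERRUPTIBLE, PF_FROZEN };

    printf("uninterruptible sleeper counts: %d\n",
           contributes_to_load(&sleeper));  /* prints 1 */
    printf("frozen task counts:             %d\n",
           contributes_to_load(&frozen));   /* prints 0 */
    return 0;
}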

kernel/posix-cpu-timers.c

@@ -224,7 +224,7 @@ static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p,
                 cpu->cpu = virt_ticks(p);
                 break;
         case CPUCLOCK_SCHED:
-                cpu->sched = p->se.sum_exec_runtime + task_delta_exec(p);
+                cpu->sched = task_sched_runtime(p);
                 break;
         }
         return 0;

@@ -305,18 +305,19 @@ static int cpu_clock_sample_group(const clockid_t which_clock,
 {
         struct task_cputime cputime;
 
-        thread_group_cputime(p, &cputime);
         switch (CPUCLOCK_WHICH(which_clock)) {
         default:
                 return -EINVAL;
         case CPUCLOCK_PROF:
+                thread_group_cputime(p, &cputime);
                 cpu->cpu = cputime_add(cputime.utime, cputime.stime);
                 break;
         case CPUCLOCK_VIRT:
+                thread_group_cputime(p, &cputime);
                 cpu->cpu = cputime.utime;
                 break;
         case CPUCLOCK_SCHED:
-                cpu->sched = cputime.sum_exec_runtime + task_delta_exec(p);
+                cpu->sched = thread_group_sched_runtime(p);
                 break;
         }
         return 0;
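These samplers back the POSIX CPU clocks that userspace reads through clock_gettime(): CLOCK_THREAD_CPUTIME_ID maps to the per-task sample and CLOCK_PROCESS_CPUTIME_ID to the thread-group sample. A small sketch (editorial addition, not part of the commit) that exercises both; with this fix, repeated reads of these clocks should never jump backwards:

#define _POSIX_C_SOURCE 200112L
#include <stdio.h>
#include <time.h>

int main(void)
{
    struct timespec proc_ts, thread_ts;

    /* Burn a little CPU so the clocks have something to report. */
    for (volatile unsigned long i = 0; i < 50000000UL; i++)
        ;

    clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &proc_ts);  /* thread-group time */
    clock_gettime(CLOCK_THREAD_CPUTIME_ID, &thread_ts); /* per-task time */

    printf("process CPU time: %ld.%09ld s\n",
           (long)proc_ts.tv_sec, proc_ts.tv_nsec);
    printf("thread  CPU time: %ld.%09ld s\n",
           (long)thread_ts.tv_sec, thread_ts.tv_nsec);
    return 0;
}

(On older glibc, link with -lrt.)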

kernel/sched.c

@@ -1418,10 +1418,22 @@ iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
                    struct rq_iterator *iterator);
 #endif
 
+/* Time spent by the tasks of the cpu accounting group executing in ... */
+enum cpuacct_stat_index {
+        CPUACCT_STAT_USER,      /* ... user mode */
+        CPUACCT_STAT_SYSTEM,    /* ... kernel mode */
+
+        CPUACCT_STAT_NSTATS,
+};
+
 #ifdef CONFIG_CGROUP_CPUACCT
 static void cpuacct_charge(struct task_struct *tsk, u64 cputime);
+static void cpuacct_update_stats(struct task_struct *tsk,
+                enum cpuacct_stat_index idx, cputime_t val);
 #else
 static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
+static inline void cpuacct_update_stats(struct task_struct *tsk,
+                enum cpuacct_stat_index idx, cputime_t val) {}
 #endif
 
 static inline void inc_cpu_load(struct rq *rq, unsigned long load)
@@ -4511,9 +4523,25 @@ DEFINE_PER_CPU(struct kernel_stat, kstat);
 EXPORT_PER_CPU_SYMBOL(kstat);
 
 /*
- * Return any ns on the sched_clock that have not yet been banked in
+ * Return any ns on the sched_clock that have not yet been accounted in
  * @p in case that task is currently running.
+ *
+ * Called with task_rq_lock() held on @rq.
  */
+static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq)
+{
+        u64 ns = 0;
+
+        if (task_current(rq, p)) {
+                update_rq_clock(rq);
+                ns = rq->clock - p->se.exec_start;
+                if ((s64)ns < 0)
+                        ns = 0;
+        }
+
+        return ns;
+}
+
 unsigned long long task_delta_exec(struct task_struct *p)
 {
         unsigned long flags;
@@ -4521,16 +4549,49 @@ unsigned long long task_delta_exec(struct task_struct *p)
         u64 ns = 0;
 
         rq = task_rq_lock(p, &flags);
+        ns = do_task_delta_exec(p, rq);
+        task_rq_unlock(rq, &flags);
 
-        if (task_current(rq, p)) {
-                u64 delta_exec;
+        return ns;
+}
 
-                update_rq_clock(rq);
-                delta_exec = rq->clock - p->se.exec_start;
-                if ((s64)delta_exec > 0)
-                        ns = delta_exec;
-        }
+/*
+ * Return accounted runtime for the task.
+ * In case the task is currently running, return the runtime plus current's
+ * pending runtime that has not been accounted yet.
+ */
+unsigned long long task_sched_runtime(struct task_struct *p)
+{
+        unsigned long flags;
+        struct rq *rq;
+        u64 ns = 0;
+
+        rq = task_rq_lock(p, &flags);
+        ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq);
+        task_rq_unlock(rq, &flags);
+
+        return ns;
+}
 
+/*
+ * Return sum_exec_runtime for the thread group.
+ * In case the task is currently running, return the sum plus current's
+ * pending runtime that has not been accounted yet.
+ *
+ * Note that the thread group might have other running tasks as well,
+ * so the return value does not include pending runtime that other
+ * running tasks might have.
+ */
+unsigned long long thread_group_sched_runtime(struct task_struct *p)
+{
+        struct task_cputime totals;
+        unsigned long flags;
+        struct rq *rq;
+        u64 ns;
+
+        rq = task_rq_lock(p, &flags);
+        thread_group_cputime(p, &totals);
+        ns = totals.sum_exec_runtime + do_task_delta_exec(p, rq);
         task_rq_unlock(rq, &flags);
 
         return ns;
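do_task_delta_exec() factors out the read pattern shared by task_delta_exec(), task_sched_runtime() and thread_group_sched_runtime(): a banked total plus the in-flight delta of the currently running task. A standalone sketch of the same pattern (editorial illustration; CLOCK_MONOTONIC stands in for the runqueue clock, and struct runtime_acct is invented for the example):

#define _POSIX_C_SOURCE 200112L
#include <stdint.h>
#include <stdio.h>
#include <time.h>

/* A "task": banked runtime plus the start of the current run, if any. */
struct runtime_acct {
    uint64_t sum_ns;    /* already-accounted runtime (like sum_exec_runtime) */
    uint64_t start_ns;  /* start of the current run (like exec_start) */
    int running;
};

static uint64_t now_ns(void)
{
    struct timespec ts;
    clock_gettime(CLOCK_MONOTONIC, &ts);
    return (uint64_t)ts.tv_sec * 1000000000ull + (uint64_t)ts.tv_nsec;
}

/* Mirrors task_sched_runtime(): banked total plus any unbanked delta. */
static uint64_t acct_read(const struct runtime_acct *a)
{
    uint64_t ns = a->sum_ns;

    if (a->running) {
        int64_t delta = (int64_t)(now_ns() - a->start_ns);
        if (delta > 0)  /* clamp, as do_task_delta_exec() does */
            ns += (uint64_t)delta;
    }
    return ns;
}

int main(void)
{
    struct runtime_acct a = { 5000000, now_ns(), 1 };

    printf("runtime so far: %llu ns\n", (unsigned long long)acct_read(&a));
    return 0;
}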
@@ -4559,6 +4620,8 @@ void account_user_time(struct task_struct *p, cputime_t cputime,
                 cpustat->nice = cputime64_add(cpustat->nice, tmp);
         else
                 cpustat->user = cputime64_add(cpustat->user, tmp);
+
+        cpuacct_update_stats(p, CPUACCT_STAT_USER, cputime);
         /* Account for user time used */
         acct_update_integrals(p);
 }

@@ -4620,6 +4683,8 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
         else
                 cpustat->system = cputime64_add(cpustat->system, tmp);
 
+        cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime);
+
         /* Account for system time used */
         acct_update_integrals(p);
 }
@@ -7302,7 +7367,8 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
                 cpumask_or(groupmask, groupmask, sched_group_cpus(group));
 
                 cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));
-                printk(KERN_CONT " %s", str);
+                printk(KERN_CONT " %s (__cpu_power = %d)", str,
+                                                group->__cpu_power);
 
                 group = group->next;
         } while (group != sd->groups);
@@ -9925,6 +9991,7 @@ struct cpuacct {
         struct cgroup_subsys_state css;
         /* cpuusage holds pointer to a u64-type object on every cpu */
         u64 *cpuusage;
+        struct percpu_counter cpustat[CPUACCT_STAT_NSTATS];
         struct cpuacct *parent;
 };
@@ -9949,20 +10016,32 @@ static struct cgroup_subsys_state *cpuacct_create(
         struct cgroup_subsys *ss, struct cgroup *cgrp)
 {
         struct cpuacct *ca = kzalloc(sizeof(*ca), GFP_KERNEL);
+        int i;
 
         if (!ca)
-                return ERR_PTR(-ENOMEM);
+                goto out;
 
         ca->cpuusage = alloc_percpu(u64);
-        if (!ca->cpuusage) {
-                kfree(ca);
-                return ERR_PTR(-ENOMEM);
-        }
+        if (!ca->cpuusage)
+                goto out_free_ca;
+
+        for (i = 0; i < CPUACCT_STAT_NSTATS; i++)
+                if (percpu_counter_init(&ca->cpustat[i], 0))
+                        goto out_free_counters;
 
         if (cgrp->parent)
                 ca->parent = cgroup_ca(cgrp->parent);
 
         return &ca->css;
+
+out_free_counters:
+        while (--i >= 0)
+                percpu_counter_destroy(&ca->cpustat[i]);
+        free_percpu(ca->cpuusage);
+out_free_ca:
+        kfree(ca);
+out:
+        return ERR_PTR(-ENOMEM);
 }
 
 /* destroy an existing cpu accounting group */
@@ -9970,7 +10049,10 @@ static void
 cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
 {
         struct cpuacct *ca = cgroup_ca(cgrp);
+        int i;
 
+        for (i = 0; i < CPUACCT_STAT_NSTATS; i++)
+                percpu_counter_destroy(&ca->cpustat[i]);
         free_percpu(ca->cpuusage);
         kfree(ca);
 }
@@ -10057,6 +10139,25 @@ static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft,
         return 0;
 }
 
+static const char *cpuacct_stat_desc[] = {
+        [CPUACCT_STAT_USER] = "user",
+        [CPUACCT_STAT_SYSTEM] = "system",
+};
+
+static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft,
+                struct cgroup_map_cb *cb)
+{
+        struct cpuacct *ca = cgroup_ca(cgrp);
+        int i;
+
+        for (i = 0; i < CPUACCT_STAT_NSTATS; i++) {
+                s64 val = percpu_counter_read(&ca->cpustat[i]);
+                val = cputime64_to_clock_t(val);
+                cb->fill(cb, cpuacct_stat_desc[i], val);
+        }
+        return 0;
+}
+
 static struct cftype files[] = {
         {
                 .name = "usage",
@@ -10067,7 +10168,10 @@ static struct cftype files[] = {
                 .name = "usage_percpu",
                 .read_seq_string = cpuacct_percpu_seq_read,
         },
-
+        {
+                .name = "stat",
+                .read_map = cpuacct_stats_show,
+        },
 };
 
 static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
@@ -10089,12 +10193,38 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
                 return;
 
         cpu = task_cpu(tsk);
+
+        rcu_read_lock();
+
         ca = task_ca(tsk);
 
         for (; ca; ca = ca->parent) {
                 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
                 *cpuusage += cputime;
         }
+
+        rcu_read_unlock();
+}
+
+/*
+ * Charge the system/user time to the task's accounting group.
+ */
+static void cpuacct_update_stats(struct task_struct *tsk,
+                enum cpuacct_stat_index idx, cputime_t val)
+{
+        struct cpuacct *ca;
+
+        if (unlikely(!cpuacct_subsys.active))
+                return;
+
+        rcu_read_lock();
+        ca = task_ca(tsk);
+
+        do {
+                percpu_counter_add(&ca->cpustat[idx], val);
+                ca = ca->parent;
+        } while (ca);
+        rcu_read_unlock();
+}
 
 struct cgroup_subsys cpuacct_subsys = {

kernel/sched_cpupri.c

@@ -55,7 +55,7 @@ static int convert_prio(int prio)
  * cpupri_find - find the best (lowest-pri) CPU in the system
  * @cp: The cpupri context
  * @p: The task
- * @lowest_mask: A mask to fill in with selected CPUs
+ * @lowest_mask: A mask to fill in with selected CPUs (or NULL)
  *
  * Note: This function returns the recommended CPUs as calculated during the
  * current invocation. By the time the call returns, the CPUs may have in

@@ -81,7 +81,8 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p,
                 if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids)
                         continue;
 
-                cpumask_and(lowest_mask, &p->cpus_allowed, vec->mask);
+                if (lowest_mask)
+                        cpumask_and(lowest_mask, &p->cpus_allowed, vec->mask);
 
                 return 1;
         }
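Accepting a NULL lowest_mask turns cpupri_find() into an optional-out-parameter API: a caller that only wants a yes/no answer passes NULL, and the result buffer is neither required nor touched. A generic sketch of that pattern (editorial illustration; find_lowest() is a hypothetical helper, not kernel code):

#include <stdio.h>

/* Returns 1 if a candidate exists; fills *out only when out is non-NULL. */
static int find_lowest(const int *prio, int n, int *out)
{
    int best = -1;

    for (int i = 0; i < n; i++)
        if (best < 0 || prio[i] < prio[best])
            best = i;
    if (best < 0)
        return 0;
    if (out)            /* mirrors "if (lowest_mask) cpumask_and(...)" */
        *out = best;
    return 1;
}

int main(void)
{
    int prio[] = { 3, 1, 2 };
    int idx;

    if (find_lowest(prio, 3, NULL))     /* probe only, no buffer needed */
        puts("a candidate exists");
    if (find_lowest(prio, 3, &idx))     /* full query */
        printf("lowest at index %d\n", idx);
    return 0;
}

This is what lets the sched_rt.c change below drop its GFP_ATOMIC cpumask allocation from the fast path.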

kernel/sched_rt.c

@@ -948,20 +948,15 @@ static int select_task_rq_rt(struct task_struct *p, int sync)
 
 static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
 {
-        cpumask_var_t mask;
-
         if (rq->curr->rt.nr_cpus_allowed == 1)
                 return;
 
-        if (!alloc_cpumask_var(&mask, GFP_ATOMIC))
+        if (p->rt.nr_cpus_allowed != 1
+            && cpupri_find(&rq->rd->cpupri, p, NULL))
                 return;
 
-        if (p->rt.nr_cpus_allowed != 1
-            && cpupri_find(&rq->rd->cpupri, p, mask))
-                goto free;
-
-        if (!cpupri_find(&rq->rd->cpupri, rq->curr, mask))
-                goto free;
+        if (!cpupri_find(&rq->rd->cpupri, rq->curr, NULL))
+                return;
 
         /*
          * There appears to be other cpus that can accept

@@ -970,8 +965,6 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
          */
         requeue_task_rt(rq, p, 1);
         resched_task(rq->curr);
-
-free:
-        free_cpumask_var(mask);
 }
 
 #endif /* CONFIG_SMP */