Merge branch 'sched-fixes-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip

* 'sched-fixes-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip:
  sched: do not count frozen tasks toward load
  sched: refresh MAINTAINERS entry
  sched: Print sched_group::__cpu_power in sched_domain_debug
  cpuacct: add per-cgroup utime/stime statistics
  posixtimers, sched: Fix posix clock monotonicity
  sched_rt: don't allocate cpumask in fastpath
  cpuacct: make cpuacct hierarchy walk in cpuacct_charge() safe when rcupreempt is used -v2
commit 17b2e9bf27
Author: Linus Torvalds
Date:   2009-04-09 10:37:28 -07:00
7 changed files with 178 additions and 34 deletions

Documentation/cgroups/cpuacct.txt

@@ -30,3 +30,21 @@ The above steps create a new group g1 and move the current shell
 process (bash) into it. CPU time consumed by this bash and its children
 can be obtained from g1/cpuacct.usage and the same is accumulated in
 /cgroups/cpuacct.usage also.
+
+cpuacct.stat file lists a few statistics which further divide the
+CPU time obtained by the cgroup into user and system times. Currently
+the following statistics are supported:
+
+user: Time spent by tasks of the cgroup in user mode.
+system: Time spent by tasks of the cgroup in kernel mode.
+
+user and system are in USER_HZ units.
+
+cpuacct controller uses percpu_counter interface to collect user and
+system times. This has two side effects:
+
+- It is theoretically possible to see wrong values for user and system times.
+  This is because percpu_counter_read() on 32bit systems isn't safe
+  against concurrent writes.
+- It is possible to see slightly outdated values for user and system times
+  due to the batch processing nature of percpu_counter.
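As a concrete illustration of the interface described above (an editorial sketch, not part of the commit): a minimal userspace reader for cpuacct.stat. The /cgroups mount point and the g1 group are taken from the example earlier in this file and are assumptions about your setup; sysconf(_SC_CLK_TCK) supplies USER_HZ for the tick-to-second conversion.

#define _POSIX_C_SOURCE 200112L
#include <stdio.h>
#include <unistd.h>

int main(void)
{
    FILE *f = fopen("/cgroups/g1/cpuacct.stat", "r"); /* assumed path */
    char key[32];
    long long ticks;
    long hz = sysconf(_SC_CLK_TCK);     /* USER_HZ as seen by userspace */

    if (!f) {
        perror("cpuacct.stat");
        return 1;
    }
    if (hz <= 0)
        hz = 100;                       /* conservative fallback */

    /* Each line is "<name> <value>", e.g. "user 4053" then "system 1421". */
    while (fscanf(f, "%31s %lld", key, &ticks) == 2)
        printf("%s: %.2f s\n", key, (double)ticks / (double)hz);

    fclose(f);
    return 0;
}

Because of the percpu_counter batching noted above, two back-to-back reads may differ slightly even on an idle group.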

MAINTAINERS

@@ -3873,8 +3873,8 @@ S:	Maintained
 SCHEDULER
 P:	Ingo Molnar
 M:	mingo@elte.hu
-P:	Robert Love    [the preemptible kernel bits]
-M:	rml@tech9.net
+P:	Peter Zijlstra
+M:	peterz@infradead.org
 L:	linux-kernel@vger.kernel.org
 S:	Maintained

include/linux/sched.h

@@ -205,7 +205,8 @@ extern unsigned long long time_sync_thresh;
 #define task_is_stopped_or_traced(task) \
                         ((task->state & (__TASK_STOPPED | __TASK_TRACED)) != 0)
 #define task_contributes_to_load(task) \
-                        ((task->state & TASK_UNINTERRUPTIBLE) != 0)
+                        ((task->state & TASK_UNINTERRUPTIBLE) != 0 && \
+                         (task->flags & PF_FROZEN) == 0)
 #define __set_task_state(tsk, state_value) \
         do { (tsk)->state = (state_value); } while (0)
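To see what the new predicate changes, here is a standalone sketch (an editorial illustration, not from the commit) that mirrors the macro against mock task values. The struct mock_task type is invented for the example, and the constant values are copied from kernels of this era for illustration only: an uninterruptible sleeper still counts toward the load average, while the same task with PF_FROZEN set no longer does.

#include <stdio.h>

#define TASK_UNINTERRUPTIBLE 2          /* illustrative value */
#define PF_FROZEN 0x00010000            /* illustrative, era-specific value */

struct mock_task {                      /* invented stand-in for task_struct */
    long state;
    unsigned long flags;
};

/* Mirrors the new task_contributes_to_load() definition. */
static int contributes_to_load(const struct mock_task *t)
{
    return (t->state & TASK_UNINTERRUPTIBLE) != 0 &&
           (t->flags & PF_FROZEN) == 0;
}

int main(void)
{
    struct mock_task sleeper = { TASK_UNINTERRUPTIBLE, 0 };
    struct mock_task frozen  = { TASK_UNINTERRUPTIBLE, PF_FROZEN };

    printf("uninterruptible sleeper counts: %d\n",
           contributes_to_load(&sleeper));  /* prints 1 */
    printf("frozen task counts:             %d\n",
           contributes_to_load(&frozen));   /* prints 0 */
    return 0;
}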

kernel/posix-cpu-timers.c

@@ -224,7 +224,7 @@ static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p,
                 cpu->cpu = virt_ticks(p);
                 break;
         case CPUCLOCK_SCHED:
-                cpu->sched = p->se.sum_exec_runtime + task_delta_exec(p);
+                cpu->sched = task_sched_runtime(p);
                 break;
         }
         return 0;

@@ -305,18 +305,19 @@ static int cpu_clock_sample_group(const clockid_t which_clock,
 {
         struct task_cputime cputime;
 
-        thread_group_cputime(p, &cputime);
         switch (CPUCLOCK_WHICH(which_clock)) {
         default:
                 return -EINVAL;
         case CPUCLOCK_PROF:
+                thread_group_cputime(p, &cputime);
                 cpu->cpu = cputime_add(cputime.utime, cputime.stime);
                 break;
         case CPUCLOCK_VIRT:
+                thread_group_cputime(p, &cputime);
                 cpu->cpu = cputime.utime;
                 break;
         case CPUCLOCK_SCHED:
-                cpu->sched = cputime.sum_exec_runtime + task_delta_exec(p);
+                cpu->sched = thread_group_sched_runtime(p);
                 break;
         }
         return 0;
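These samplers back the POSIX CPU clocks that userspace reads through clock_gettime(): CLOCK_THREAD_CPUTIME_ID maps to the per-task sample and CLOCK_PROCESS_CPUTIME_ID to the thread-group sample. A small sketch (editorial addition, not part of the commit) that exercises both; with this fix, repeated reads of these clocks should never jump backwards:

#define _POSIX_C_SOURCE 200112L
#include <stdio.h>
#include <time.h>

int main(void)
{
    struct timespec proc_ts, thread_ts;

    /* Burn a little CPU so the clocks have something to report. */
    for (volatile unsigned long i = 0; i < 50000000UL; i++)
        ;

    clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &proc_ts);  /* thread-group time */
    clock_gettime(CLOCK_THREAD_CPUTIME_ID, &thread_ts); /* per-task time */

    printf("process CPU time: %ld.%09ld s\n",
           (long)proc_ts.tv_sec, proc_ts.tv_nsec);
    printf("thread  CPU time: %ld.%09ld s\n",
           (long)thread_ts.tv_sec, thread_ts.tv_nsec);
    return 0;
}

(On older glibc, link with -lrt.)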

kernel/sched.c

@@ -1418,10 +1418,22 @@ iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
                    struct rq_iterator *iterator);
 #endif
 
+/* Time spent by the tasks of the cpu accounting group executing in ... */
+enum cpuacct_stat_index {
+        CPUACCT_STAT_USER,      /* ... user mode */
+        CPUACCT_STAT_SYSTEM,    /* ... kernel mode */
+
+        CPUACCT_STAT_NSTATS,
+};
+
 #ifdef CONFIG_CGROUP_CPUACCT
 static void cpuacct_charge(struct task_struct *tsk, u64 cputime);
+static void cpuacct_update_stats(struct task_struct *tsk,
+                enum cpuacct_stat_index idx, cputime_t val);
 #else
 static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
+static inline void cpuacct_update_stats(struct task_struct *tsk,
+                enum cpuacct_stat_index idx, cputime_t val) {}
 #endif
 
 static inline void inc_cpu_load(struct rq *rq, unsigned long load)
@@ -4511,9 +4523,25 @@ DEFINE_PER_CPU(struct kernel_stat, kstat);
 EXPORT_PER_CPU_SYMBOL(kstat);
 
 /*
- * Return any ns on the sched_clock that have not yet been banked in
+ * Return any ns on the sched_clock that have not yet been accounted in
  * @p in case that task is currently running.
+ *
+ * Called with task_rq_lock() held on @rq.
  */
+static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq)
+{
+        u64 ns = 0;
+
+        if (task_current(rq, p)) {
+                update_rq_clock(rq);
+                ns = rq->clock - p->se.exec_start;
+                if ((s64)ns < 0)
+                        ns = 0;
+        }
+
+        return ns;
+}
+
 unsigned long long task_delta_exec(struct task_struct *p)
 {
         unsigned long flags;
@@ -4521,16 +4549,49 @@ unsigned long long task_delta_exec(struct task_struct *p)
         u64 ns = 0;
 
         rq = task_rq_lock(p, &flags);
+        ns = do_task_delta_exec(p, rq);
+        task_rq_unlock(rq, &flags);
 
-        if (task_current(rq, p)) {
-                u64 delta_exec;
+        return ns;
+}
 
-                update_rq_clock(rq);
-                delta_exec = rq->clock - p->se.exec_start;
-                if ((s64)delta_exec > 0)
-                        ns = delta_exec;
-        }
+/*
+ * Return accounted runtime for the task.
+ * In case the task is currently running, return the runtime plus current's
+ * pending runtime that has not been accounted yet.
+ */
+unsigned long long task_sched_runtime(struct task_struct *p)
+{
+        unsigned long flags;
+        struct rq *rq;
+        u64 ns = 0;
+
+        rq = task_rq_lock(p, &flags);
+        ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq);
+        task_rq_unlock(rq, &flags);
+
+        return ns;
+}
 
+/*
+ * Return sum_exec_runtime for the thread group.
+ * In case the task is currently running, return the sum plus current's
+ * pending runtime that has not been accounted yet.
+ *
+ * Note that the thread group might have other running tasks as well,
+ * so the return value does not include pending runtime that other
+ * running tasks might have.
+ */
+unsigned long long thread_group_sched_runtime(struct task_struct *p)
+{
+        struct task_cputime totals;
+        unsigned long flags;
+        struct rq *rq;
+        u64 ns;
+
+        rq = task_rq_lock(p, &flags);
+        thread_group_cputime(p, &totals);
+        ns = totals.sum_exec_runtime + do_task_delta_exec(p, rq);
         task_rq_unlock(rq, &flags);
 
         return ns;
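do_task_delta_exec() factors out the read pattern shared by task_delta_exec(), task_sched_runtime() and thread_group_sched_runtime(): a banked total plus the in-flight delta of the currently running task. A standalone sketch of the same pattern (editorial illustration; CLOCK_MONOTONIC stands in for the runqueue clock, and struct runtime_acct is invented for the example):

#define _POSIX_C_SOURCE 200112L
#include <stdint.h>
#include <stdio.h>
#include <time.h>

/* A "task": banked runtime plus the start of the current run, if any. */
struct runtime_acct {
    uint64_t sum_ns;    /* already-accounted runtime (like sum_exec_runtime) */
    uint64_t start_ns;  /* start of the current run (like exec_start) */
    int running;
};

static uint64_t now_ns(void)
{
    struct timespec ts;
    clock_gettime(CLOCK_MONOTONIC, &ts);
    return (uint64_t)ts.tv_sec * 1000000000ull + (uint64_t)ts.tv_nsec;
}

/* Mirrors task_sched_runtime(): banked total plus any unbanked delta. */
static uint64_t acct_read(const struct runtime_acct *a)
{
    uint64_t ns = a->sum_ns;

    if (a->running) {
        int64_t delta = (int64_t)(now_ns() - a->start_ns);
        if (delta > 0)  /* clamp, as do_task_delta_exec() does */
            ns += (uint64_t)delta;
    }
    return ns;
}

int main(void)
{
    struct runtime_acct a = { 5000000, now_ns(), 1 };

    printf("runtime so far: %llu ns\n", (unsigned long long)acct_read(&a));
    return 0;
}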
@@ -4559,6 +4620,8 @@ void account_user_time(struct task_struct *p, cputime_t cputime,
                 cpustat->nice = cputime64_add(cpustat->nice, tmp);
         else
                 cpustat->user = cputime64_add(cpustat->user, tmp);
+
+        cpuacct_update_stats(p, CPUACCT_STAT_USER, cputime);
         /* Account for user time used */
         acct_update_integrals(p);
 }

@@ -4620,6 +4683,8 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
         else
                 cpustat->system = cputime64_add(cpustat->system, tmp);
 
+        cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime);
+
         /* Account for system time used */
         acct_update_integrals(p);
 }
@@ -7302,7 +7367,8 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
                 cpumask_or(groupmask, groupmask, sched_group_cpus(group));
 
                 cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));
-                printk(KERN_CONT " %s", str);
+                printk(KERN_CONT " %s (__cpu_power = %d)", str,
+                                                group->__cpu_power);
 
                 group = group->next;
         } while (group != sd->groups);
@@ -9925,6 +9991,7 @@ struct cpuacct {
         struct cgroup_subsys_state css;
         /* cpuusage holds pointer to a u64-type object on every cpu */
         u64 *cpuusage;
+        struct percpu_counter cpustat[CPUACCT_STAT_NSTATS];
         struct cpuacct *parent;
 };
@@ -9949,20 +10016,32 @@ static struct cgroup_subsys_state *cpuacct_create(
         struct cgroup_subsys *ss, struct cgroup *cgrp)
 {
         struct cpuacct *ca = kzalloc(sizeof(*ca), GFP_KERNEL);
+        int i;
 
         if (!ca)
-                return ERR_PTR(-ENOMEM);
+                goto out;
 
         ca->cpuusage = alloc_percpu(u64);
-        if (!ca->cpuusage) {
-                kfree(ca);
-                return ERR_PTR(-ENOMEM);
-        }
+        if (!ca->cpuusage)
+                goto out_free_ca;
+
+        for (i = 0; i < CPUACCT_STAT_NSTATS; i++)
+                if (percpu_counter_init(&ca->cpustat[i], 0))
+                        goto out_free_counters;
 
         if (cgrp->parent)
                 ca->parent = cgroup_ca(cgrp->parent);
 
         return &ca->css;
+
+out_free_counters:
+        while (--i >= 0)
+                percpu_counter_destroy(&ca->cpustat[i]);
+        free_percpu(ca->cpuusage);
+out_free_ca:
+        kfree(ca);
+out:
+        return ERR_PTR(-ENOMEM);
 }
 
 /* destroy an existing cpu accounting group */
@@ -9970,7 +10049,10 @@ static void
 cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
 {
         struct cpuacct *ca = cgroup_ca(cgrp);
+        int i;
 
+        for (i = 0; i < CPUACCT_STAT_NSTATS; i++)
+                percpu_counter_destroy(&ca->cpustat[i]);
         free_percpu(ca->cpuusage);
         kfree(ca);
 }
@@ -10057,6 +10139,25 @@ static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft,
         return 0;
 }
 
+static const char *cpuacct_stat_desc[] = {
+        [CPUACCT_STAT_USER] = "user",
+        [CPUACCT_STAT_SYSTEM] = "system",
+};
+
+static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft,
+                struct cgroup_map_cb *cb)
+{
+        struct cpuacct *ca = cgroup_ca(cgrp);
+        int i;
+
+        for (i = 0; i < CPUACCT_STAT_NSTATS; i++) {
+                s64 val = percpu_counter_read(&ca->cpustat[i]);
+                val = cputime64_to_clock_t(val);
+                cb->fill(cb, cpuacct_stat_desc[i], val);
+        }
+        return 0;
+}
+
 static struct cftype files[] = {
         {
                 .name = "usage",
@@ -10067,7 +10168,10 @@ static struct cftype files[] = {
                 .name = "usage_percpu",
                 .read_seq_string = cpuacct_percpu_seq_read,
         },
-
+        {
+                .name = "stat",
+                .read_map = cpuacct_stats_show,
+        },
 };
 
 static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
@@ -10089,12 +10193,38 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
                 return;
 
         cpu = task_cpu(tsk);
+
+        rcu_read_lock();
+
         ca = task_ca(tsk);
 
         for (; ca; ca = ca->parent) {
                 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
                 *cpuusage += cputime;
         }
+
+        rcu_read_unlock();
+}
+
+/*
+ * Charge the system/user time to the task's accounting group.
+ */
+static void cpuacct_update_stats(struct task_struct *tsk,
+                enum cpuacct_stat_index idx, cputime_t val)
+{
+        struct cpuacct *ca;
+
+        if (unlikely(!cpuacct_subsys.active))
+                return;
+
+        rcu_read_lock();
+        ca = task_ca(tsk);
+
+        do {
+                percpu_counter_add(&ca->cpustat[idx], val);
+                ca = ca->parent;
+        } while (ca);
+        rcu_read_unlock();
+}
 
 struct cgroup_subsys cpuacct_subsys = {

kernel/sched_cpupri.c

@@ -55,7 +55,7 @@ static int convert_prio(int prio)
  * cpupri_find - find the best (lowest-pri) CPU in the system
  * @cp: The cpupri context
  * @p: The task
- * @lowest_mask: A mask to fill in with selected CPUs
+ * @lowest_mask: A mask to fill in with selected CPUs (or NULL)
  *
  * Note: This function returns the recommended CPUs as calculated during the
  * current invocation. By the time the call returns, the CPUs may have in

@@ -81,7 +81,8 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p,
                 if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids)
                         continue;
 
-                cpumask_and(lowest_mask, &p->cpus_allowed, vec->mask);
+                if (lowest_mask)
+                        cpumask_and(lowest_mask, &p->cpus_allowed, vec->mask);
 
                 return 1;
         }
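Accepting a NULL lowest_mask turns cpupri_find() into an optional-out-parameter API: a caller that only wants a yes/no answer passes NULL, and the result buffer is neither required nor touched. A generic sketch of that pattern (editorial illustration; find_lowest() is a hypothetical helper, not kernel code):

#include <stdio.h>

/* Returns 1 if a candidate exists; fills *out only when out is non-NULL. */
static int find_lowest(const int *prio, int n, int *out)
{
    int best = -1;

    for (int i = 0; i < n; i++)
        if (best < 0 || prio[i] < prio[best])
            best = i;
    if (best < 0)
        return 0;
    if (out)            /* mirrors "if (lowest_mask) cpumask_and(...)" */
        *out = best;
    return 1;
}

int main(void)
{
    int prio[] = { 3, 1, 2 };
    int idx;

    if (find_lowest(prio, 3, NULL))     /* probe only, no buffer needed */
        puts("a candidate exists");
    if (find_lowest(prio, 3, &idx))     /* full query */
        printf("lowest at index %d\n", idx);
    return 0;
}

This is what lets the sched_rt.c change below drop its GFP_ATOMIC cpumask allocation from the fast path.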

kernel/sched_rt.c

@@ -948,20 +948,15 @@ static int select_task_rq_rt(struct task_struct *p, int sync)
 
 static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
 {
-        cpumask_var_t mask;
-
         if (rq->curr->rt.nr_cpus_allowed == 1)
                 return;
 
-        if (!alloc_cpumask_var(&mask, GFP_ATOMIC))
+        if (p->rt.nr_cpus_allowed != 1
+            && cpupri_find(&rq->rd->cpupri, p, NULL))
                 return;
 
-        if (p->rt.nr_cpus_allowed != 1
-            && cpupri_find(&rq->rd->cpupri, p, mask))
-                goto free;
-
-        if (!cpupri_find(&rq->rd->cpupri, rq->curr, mask))
-                goto free;
+        if (!cpupri_find(&rq->rd->cpupri, rq->curr, NULL))
+                return;
 
         /*
          * There appears to be other cpus that can accept

@@ -970,8 +965,6 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
          */
         requeue_task_rt(rq, p, 1);
         resched_task(rq->curr);
-
-free:
-        free_cpumask_var(mask);
 }
 
 #endif /* CONFIG_SMP */