mirror of
https://github.com/adulau/aha.git
synced 2024-12-28 19:56:18 +00:00
cgroups: mechanism to process each task in a cgroup
Provide cgroup_scan_tasks(), which iterates through every task in a cgroup, calling a test function and a process function for each. The process function is called without holding the css_set_lock lock. The idea is David Rientjes', predicting that such a function will make it much easier in the future to extend things that require access to each task in a cgroup without holding the lock. [akpm@linux-foundation.org: cleanup] [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Cliff Wickman <cpw@sgi.com> Cc: Paul Menage <menage@google.com> Cc: Paul Jackson <pj@sgi.com> Acked-by: David Rientjes <rientjes@google.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
This commit is contained in:
parent
dfc05c259e
commit
31a7df01fd
2 changed files with 200 additions and 12 deletions
|
@ -14,6 +14,7 @@
|
||||||
#include <linux/nodemask.h>
|
#include <linux/nodemask.h>
|
||||||
#include <linux/rcupdate.h>
|
#include <linux/rcupdate.h>
|
||||||
#include <linux/cgroupstats.h>
|
#include <linux/cgroupstats.h>
|
||||||
|
#include <linux/prio_heap.h>
|
||||||
|
|
||||||
#ifdef CONFIG_CGROUPS
|
#ifdef CONFIG_CGROUPS
|
||||||
|
|
||||||
|
@ -207,6 +208,14 @@ struct cftype {
|
||||||
int (*release) (struct inode *inode, struct file *file);
|
int (*release) (struct inode *inode, struct file *file);
|
||||||
};
|
};
|
||||||
|
|
||||||
|
/*
 * Argument bundle handed to cgroup_scan_tasks(): which cgroup to walk,
 * which callbacks to run on its tasks and, optionally, a caller-provided
 * heap to collect the tasks in.
 */
struct cgroup_scanner {
	/* The cgroup whose tasks are to be scanned */
	struct cgroup *cg;

	/*
	 * Optional filter.  A NULL pointer selects every task; otherwise a
	 * task is selected only when the callback returns non-zero.
	 * Invoked while the task iterator is active.
	 */
	int (*test_task)(struct task_struct *p, struct cgroup_scanner *scan);

	/*
	 * Action applied to every selected task.  Invoked after the task
	 * iterator has been ended.
	 */
	void (*process_task)(struct task_struct *p,
			     struct cgroup_scanner *scan);

	/*
	 * Optional pre-allocated heap.  When non-NULL it is used as is
	 * (its "gt" member gets overwritten); when NULL,
	 * cgroup_scan_tasks() sets up a temporary heap of its own.
	 */
	struct ptr_heap *heap;
};
|
||||||
|
|
||||||
/* Add a new file to the given cgroup directory. Should only be
|
/* Add a new file to the given cgroup directory. Should only be
|
||||||
* called by subsystems from within a populate() method */
|
* called by subsystems from within a populate() method */
|
||||||
int cgroup_add_file(struct cgroup *cont, struct cgroup_subsys *subsys,
|
int cgroup_add_file(struct cgroup *cont, struct cgroup_subsys *subsys,
|
||||||
|
@ -299,11 +308,16 @@ struct cgroup_iter {
|
||||||
* returns NULL or until you want to end the iteration
|
* returns NULL or until you want to end the iteration
|
||||||
*
|
*
|
||||||
* 3) call cgroup_iter_end() to destroy the iterator.
|
* 3) call cgroup_iter_end() to destroy the iterator.
|
||||||
|
*
|
||||||
|
* Or, call cgroup_scan_tasks() to iterate through every task in a cgroup.
|
||||||
|
* - cgroup_scan_tasks() holds the css_set_lock when calling the test_task()
|
||||||
|
* callback, but not while calling the process_task() callback.
|
||||||
*/
|
*/
|
||||||
void cgroup_iter_start(struct cgroup *cont, struct cgroup_iter *it);
|
void cgroup_iter_start(struct cgroup *cont, struct cgroup_iter *it);
|
||||||
struct task_struct *cgroup_iter_next(struct cgroup *cont,
|
struct task_struct *cgroup_iter_next(struct cgroup *cont,
|
||||||
struct cgroup_iter *it);
|
struct cgroup_iter *it);
|
||||||
void cgroup_iter_end(struct cgroup *cont, struct cgroup_iter *it);
|
void cgroup_iter_end(struct cgroup *cont, struct cgroup_iter *it);
|
||||||
|
int cgroup_scan_tasks(struct cgroup_scanner *scan);
|
||||||
|
|
||||||
#else /* !CONFIG_CGROUPS */
|
#else /* !CONFIG_CGROUPS */
|
||||||
|
|
||||||
|
|
198
kernel/cgroup.c
198
kernel/cgroup.c
|
@ -1695,6 +1695,29 @@ static void cgroup_advance_iter(struct cgroup *cgrp,
|
||||||
it->task = cg->tasks.next;
|
it->task = cg->tasks.next;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* To reduce the fork() overhead for systems that are not actually
|
||||||
|
* using their cgroups capability, we don't maintain the lists running
|
||||||
|
* through each css_set to its tasks until we see the list actually
|
||||||
|
* used - in other words after the first call to cgroup_iter_start().
|
||||||
|
*
|
||||||
|
* The tasklist_lock is not held here, as do_each_thread() and
|
||||||
|
* while_each_thread() are protected by RCU.
|
||||||
|
*/
|
||||||
|
void cgroup_enable_task_cg_lists(void)
|
||||||
|
{
|
||||||
|
struct task_struct *p, *g;
|
||||||
|
write_lock(&css_set_lock);
|
||||||
|
use_task_css_set_links = 1;
|
||||||
|
do_each_thread(g, p) {
|
||||||
|
task_lock(p);
|
||||||
|
if (list_empty(&p->cg_list))
|
||||||
|
list_add(&p->cg_list, &p->cgroups->tasks);
|
||||||
|
task_unlock(p);
|
||||||
|
} while_each_thread(g, p);
|
||||||
|
write_unlock(&css_set_lock);
|
||||||
|
}
|
||||||
|
|
||||||
void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it)
|
void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it)
|
||||||
{
|
{
|
||||||
/*
|
/*
|
||||||
|
@ -1702,18 +1725,9 @@ void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it)
|
||||||
* we need to enable the list linking each css_set to its
|
* we need to enable the list linking each css_set to its
|
||||||
* tasks, and fix up all existing tasks.
|
* tasks, and fix up all existing tasks.
|
||||||
*/
|
*/
|
||||||
if (!use_task_css_set_links) {
|
if (!use_task_css_set_links)
|
||||||
struct task_struct *p, *g;
|
cgroup_enable_task_cg_lists();
|
||||||
write_lock(&css_set_lock);
|
|
||||||
use_task_css_set_links = 1;
|
|
||||||
do_each_thread(g, p) {
|
|
||||||
task_lock(p);
|
|
||||||
if (list_empty(&p->cg_list))
|
|
||||||
list_add(&p->cg_list, &p->cgroups->tasks);
|
|
||||||
task_unlock(p);
|
|
||||||
} while_each_thread(g, p);
|
|
||||||
write_unlock(&css_set_lock);
|
|
||||||
}
|
|
||||||
read_lock(&css_set_lock);
|
read_lock(&css_set_lock);
|
||||||
it->cg_link = &cgrp->css_sets;
|
it->cg_link = &cgrp->css_sets;
|
||||||
cgroup_advance_iter(cgrp, it);
|
cgroup_advance_iter(cgrp, it);
|
||||||
|
@ -1746,6 +1760,166 @@ void cgroup_iter_end(struct cgroup *cgrp, struct cgroup_iter *it)
|
||||||
read_unlock(&css_set_lock);
|
read_unlock(&css_set_lock);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static inline int started_after_time(struct task_struct *t1,
|
||||||
|
struct timespec *time,
|
||||||
|
struct task_struct *t2)
|
||||||
|
{
|
||||||
|
int start_diff = timespec_compare(&t1->start_time, time);
|
||||||
|
if (start_diff > 0) {
|
||||||
|
return 1;
|
||||||
|
} else if (start_diff < 0) {
|
||||||
|
return 0;
|
||||||
|
} else {
|
||||||
|
/*
|
||||||
|
* Arbitrarily, if two processes started at the same
|
||||||
|
* time, we'll say that the lower pointer value
|
||||||
|
* started first. Note that t2 may have exited by now
|
||||||
|
* so this may not be a valid pointer any longer, but
|
||||||
|
* that's fine - it still serves to distinguish
|
||||||
|
* between two tasks started (effectively) simultaneously.
|
||||||
|
*/
|
||||||
|
return t1 > t2;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* This function is a callback from heap_insert() and is used to order
|
||||||
|
* the heap.
|
||||||
|
* In this case we order the heap in descending task start time.
|
||||||
|
*/
|
||||||
|
static inline int started_after(void *p1, void *p2)
|
||||||
|
{
|
||||||
|
struct task_struct *t1 = p1;
|
||||||
|
struct task_struct *t2 = p2;
|
||||||
|
return started_after_time(t1, &t2->start_time, t2);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* cgroup_scan_tasks - iterate though all the tasks in a cgroup
|
||||||
|
* @scan: struct cgroup_scanner containing arguments for the scan
|
||||||
|
*
|
||||||
|
* Arguments include pointers to callback functions test_task() and
|
||||||
|
* process_task().
|
||||||
|
* Iterate through all the tasks in a cgroup, calling test_task() for each,
|
||||||
|
* and if it returns true, call process_task() for it also.
|
||||||
|
* The test_task pointer may be NULL, meaning always true (select all tasks).
|
||||||
|
* Effectively duplicates cgroup_iter_{start,next,end}()
|
||||||
|
* but does not lock css_set_lock for the call to process_task().
|
||||||
|
* The struct cgroup_scanner may be embedded in any structure of the caller's
|
||||||
|
* creation.
|
||||||
|
* It is guaranteed that process_task() will act on every task that
|
||||||
|
* is a member of the cgroup for the duration of this call. This
|
||||||
|
* function may or may not call process_task() for tasks that exit
|
||||||
|
* or move to a different cgroup during the call, or are forked or
|
||||||
|
* move into the cgroup during the call.
|
||||||
|
*
|
||||||
|
* Note that test_task() may be called with locks held, and may in some
|
||||||
|
* situations be called multiple times for the same task, so it should
|
||||||
|
* be cheap.
|
||||||
|
* If the heap pointer in the struct cgroup_scanner is non-NULL, a heap has been
|
||||||
|
* pre-allocated and will be used for heap operations (and its "gt" member will
|
||||||
|
* be overwritten), else a temporary heap will be used (allocation of which
|
||||||
|
* may cause this function to fail).
|
||||||
|
*/
|
||||||
|
int cgroup_scan_tasks(struct cgroup_scanner *scan)
|
||||||
|
{
|
||||||
|
int retval, i;
|
||||||
|
struct cgroup_iter it;
|
||||||
|
struct task_struct *p, *dropped;
|
||||||
|
/* Never dereference latest_task, since it's not refcounted */
|
||||||
|
struct task_struct *latest_task = NULL;
|
||||||
|
struct ptr_heap tmp_heap;
|
||||||
|
struct ptr_heap *heap;
|
||||||
|
struct timespec latest_time = { 0, 0 };
|
||||||
|
|
||||||
|
if (scan->heap) {
|
||||||
|
/* The caller supplied our heap and pre-allocated its memory */
|
||||||
|
heap = scan->heap;
|
||||||
|
heap->gt = &started_after;
|
||||||
|
} else {
|
||||||
|
/* We need to allocate our own heap memory */
|
||||||
|
heap = &tmp_heap;
|
||||||
|
retval = heap_init(heap, PAGE_SIZE, GFP_KERNEL, &started_after);
|
||||||
|
if (retval)
|
||||||
|
/* cannot allocate the heap */
|
||||||
|
return retval;
|
||||||
|
}
|
||||||
|
|
||||||
|
again:
|
||||||
|
/*
|
||||||
|
* Scan tasks in the cgroup, using the scanner's "test_task" callback
|
||||||
|
* to determine which are of interest, and using the scanner's
|
||||||
|
* "process_task" callback to process any of them that need an update.
|
||||||
|
* Since we don't want to hold any locks during the task updates,
|
||||||
|
* gather tasks to be processed in a heap structure.
|
||||||
|
* The heap is sorted by descending task start time.
|
||||||
|
* If the statically-sized heap fills up, we overflow tasks that
|
||||||
|
* started later, and in future iterations only consider tasks that
|
||||||
|
* started after the latest task in the previous pass. This
|
||||||
|
* guarantees forward progress and that we don't miss any tasks.
|
||||||
|
*/
|
||||||
|
heap->size = 0;
|
||||||
|
cgroup_iter_start(scan->cg, &it);
|
||||||
|
while ((p = cgroup_iter_next(scan->cg, &it))) {
|
||||||
|
/*
|
||||||
|
* Only affect tasks that qualify per the caller's callback,
|
||||||
|
* if he provided one
|
||||||
|
*/
|
||||||
|
if (scan->test_task && !scan->test_task(p, scan))
|
||||||
|
continue;
|
||||||
|
/*
|
||||||
|
* Only process tasks that started after the last task
|
||||||
|
* we processed
|
||||||
|
*/
|
||||||
|
if (!started_after_time(p, &latest_time, latest_task))
|
||||||
|
continue;
|
||||||
|
dropped = heap_insert(heap, p);
|
||||||
|
if (dropped == NULL) {
|
||||||
|
/*
|
||||||
|
* The new task was inserted; the heap wasn't
|
||||||
|
* previously full
|
||||||
|
*/
|
||||||
|
get_task_struct(p);
|
||||||
|
} else if (dropped != p) {
|
||||||
|
/*
|
||||||
|
* The new task was inserted, and pushed out a
|
||||||
|
* different task
|
||||||
|
*/
|
||||||
|
get_task_struct(p);
|
||||||
|
put_task_struct(dropped);
|
||||||
|
}
|
||||||
|
/*
|
||||||
|
* Else the new task was newer than anything already in
|
||||||
|
* the heap and wasn't inserted
|
||||||
|
*/
|
||||||
|
}
|
||||||
|
cgroup_iter_end(scan->cg, &it);
|
||||||
|
|
||||||
|
if (heap->size) {
|
||||||
|
for (i = 0; i < heap->size; i++) {
|
||||||
|
struct task_struct *p = heap->ptrs[i];
|
||||||
|
if (i == 0) {
|
||||||
|
latest_time = p->start_time;
|
||||||
|
latest_task = p;
|
||||||
|
}
|
||||||
|
/* Process the task per the caller's callback */
|
||||||
|
scan->process_task(p, scan);
|
||||||
|
put_task_struct(p);
|
||||||
|
}
|
||||||
|
/*
|
||||||
|
* If we had to process any tasks at all, scan again
|
||||||
|
* in case some of them were in the middle of forking
|
||||||
|
* children that didn't get processed.
|
||||||
|
* Not the most efficient way to do it, but it avoids
|
||||||
|
* having to take callback_mutex in the fork path
|
||||||
|
*/
|
||||||
|
goto again;
|
||||||
|
}
|
||||||
|
if (heap == &tmp_heap)
|
||||||
|
heap_free(&tmp_heap);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Stuff for reading the 'tasks' file.
|
* Stuff for reading the 'tasks' file.
|
||||||
*
|
*
|
||||||
|
|
Loading…
Reference in a new issue