cpuset: rewrite update_tasks_nodemask()

This patch uses cgroup_scan_tasks() to rebind tasks' vmas to the cpuset's
new mems_allowed.

This not only simplifies the code considerably, but also avoids allocating
an array to hold mm pointers of all the tasks in the cpuset.  This array can
be big (size > PAGESIZE) if we have lots of tasks in that cpuset, and thus
the allocation has a chance of failing when under memory stress.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Paul Menage <menage@google.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
This commit is contained in:
Li Zefan 2009-04-02 16:57:51 -07:00 committed by Linus Torvalds
parent bd1a8ab73e
commit 3b6766fe66

View file

@ -1026,6 +1026,31 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
mutex_unlock(&callback_mutex); mutex_unlock(&callback_mutex);
} }
/*
 * cpuset_change_nodemask - per-task callback used when a cpuset's
 * mems_allowed changes.
 * @p:    task whose address space is to be rebound
 * @scan: scanner control block; scan->cg identifies the cpuset's cgroup
 *        and scan->data carries the previous nodemask
 *
 * Rebinds the vma mempolicies of @p's mm to the cpuset's new mems_allowed
 * and, when the cpuset has its memory_migrate flag set, migrates the mm's
 * pages from the old nodes to the new ones.  A task for which
 * get_task_mm() returns NULL is skipped.
 *
 * Called with cgroup_mutex held.
 */
static void cpuset_change_nodemask(struct task_struct *p,
				   struct cgroup_scanner *scan)
{
	const nodemask_t *old_nodes = scan->data;
	struct mm_struct *task_mm;
	struct cpuset *owner;
	int want_migrate;

	task_mm = get_task_mm(p);
	if (!task_mm)
		return;

	owner = cgroup_cs(scan->cg);
	want_migrate = is_memory_migrate(owner);

	mpol_rebind_mm(task_mm, &owner->mems_allowed);
	if (want_migrate)
		cpuset_migrate_mm(task_mm, old_nodes, &owner->mems_allowed);

	/* Pairs with the get_task_mm() above. */
	mmput(task_mm);
}
static void *cpuset_being_rebound; static void *cpuset_being_rebound;
/** /**
@ -1038,88 +1063,32 @@ static void *cpuset_being_rebound;
*/ */
static int update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem) static int update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem)
{ {
struct task_struct *p;
struct mm_struct **mmarray;
int i, n, ntasks;
int migrate;
int fudge;
struct cgroup_iter it;
int retval; int retval;
struct cgroup_scanner scan;
cpuset_being_rebound = cs; /* causes mpol_dup() rebind */ cpuset_being_rebound = cs; /* causes mpol_dup() rebind */
fudge = 10; /* spare mmarray[] slots */ scan.cg = cs->css.cgroup;
fudge += cpumask_weight(cs->cpus_allowed);/* imagine 1 fork-bomb/cpu */ scan.test_task = NULL;
retval = -ENOMEM; scan.process_task = cpuset_change_nodemask;
scan.heap = NULL;
scan.data = (nodemask_t *)oldmem;
/* /*
* Allocate mmarray[] to hold mm reference for each task * The mpol_rebind_mm() call takes mmap_sem, which we couldn't
* in cpuset cs. Can't kmalloc GFP_KERNEL while holding * take while holding tasklist_lock. Forks can happen - the
* tasklist_lock. We could use GFP_ATOMIC, but with a * mpol_dup() cpuset_being_rebound check will catch such forks,
* few more lines of code, we can retry until we get a big * and rebind their vma mempolicies too. Because we still hold
* enough mmarray[] w/o using GFP_ATOMIC. * the global cgroup_mutex, we know that no other rebind effort
*/ * will be contending for the global variable cpuset_being_rebound.
while (1) {
ntasks = cgroup_task_count(cs->css.cgroup); /* guess */
ntasks += fudge;
mmarray = kmalloc(ntasks * sizeof(*mmarray), GFP_KERNEL);
if (!mmarray)
goto done;
read_lock(&tasklist_lock); /* block fork */
if (cgroup_task_count(cs->css.cgroup) <= ntasks)
break; /* got enough */
read_unlock(&tasklist_lock); /* try again */
kfree(mmarray);
}
n = 0;
/* Load up mmarray[] with mm reference for each task in cpuset. */
cgroup_iter_start(cs->css.cgroup, &it);
while ((p = cgroup_iter_next(cs->css.cgroup, &it))) {
struct mm_struct *mm;
if (n >= ntasks) {
printk(KERN_WARNING
"Cpuset mempolicy rebind incomplete.\n");
break;
}
mm = get_task_mm(p);
if (!mm)
continue;
mmarray[n++] = mm;
}
cgroup_iter_end(cs->css.cgroup, &it);
read_unlock(&tasklist_lock);
/*
* Now that we've dropped the tasklist spinlock, we can
* rebind the vma mempolicies of each mm in mmarray[] to their
* new cpuset, and release that mm. The mpol_rebind_mm()
* call takes mmap_sem, which we couldn't take while holding
* tasklist_lock. Forks can happen again now - the mpol_dup()
* cpuset_being_rebound check will catch such forks, and rebind
* their vma mempolicies too. Because we still hold the global
* cgroup_mutex, we know that no other rebind effort will
* be contending for the global variable cpuset_being_rebound.
* It's ok if we rebind the same mm twice; mpol_rebind_mm() * It's ok if we rebind the same mm twice; mpol_rebind_mm()
* is idempotent. Also migrate pages in each mm to new nodes. * is idempotent. Also migrate pages in each mm to new nodes.
*/ */
migrate = is_memory_migrate(cs); retval = cgroup_scan_tasks(&scan);
for (i = 0; i < n; i++) {
struct mm_struct *mm = mmarray[i];
mpol_rebind_mm(mm, &cs->mems_allowed);
if (migrate)
cpuset_migrate_mm(mm, oldmem, &cs->mems_allowed);
mmput(mm);
}
/* We're done rebinding vmas to this cpuset's new mems_allowed. */ /* We're done rebinding vmas to this cpuset's new mems_allowed. */
kfree(mmarray);
cpuset_being_rebound = NULL; cpuset_being_rebound = NULL;
retval = 0;
done:
return retval; return retval;
} }