cpuset: rewrite update_tasks_nodemask()

This patch uses cgroup_scan_tasks() to rebind tasks' vmas to the new cpuset's mems_allowed. This not only simplifies the code considerably, but also avoids allocating an array to hold the mm pointers of all the tasks in the cpuset. This array can be big (size > PAGESIZE) if there are lots of tasks in that cpuset, so the allocation has a chance of failing under memory stress. Signed-off-by: Li Zefan <lizf@cn.fujitsu.com> Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Cc: Paul Menage <menage@google.com> Cc: Ingo Molnar <mingo@elte.hu> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2024-12-28 03:36:19 +00:00 · 2009-04-02 16:57:51 -07:00 · 2009-04-02 16:57:51 -07:00 · 3b6766fe66
commit 3b6766fe66
parent bd1a8ab73e
1 changed files with 39 additions and 70 deletions
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@ -1026,6 +1026,31 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
 	mutex_unlock(&callback_mutex);
 }
 /*
 * Rebind task's vmas to cpuset's new mems_allowed, and migrate pages to new
 * nodes if memory_migrate flag is set. Called with cgroup_mutex held.
 */
 static void cpuset_change_nodemask(struct task_struct *p,
 				   struct cgroup_scanner *scan)
 {
 	struct mm_struct *mm;
 	struct cpuset *cs;
 	int migrate;
 	const nodemask_t *oldmem = scan->data;
 	mm = get_task_mm(p);
 	if (!mm)
 		return;
 	cs = cgroup_cs(scan->cg);
 	migrate = is_memory_migrate(cs);
 	mpol_rebind_mm(mm, &cs->mems_allowed);
 	if (migrate)
 		cpuset_migrate_mm(mm, oldmem, &cs->mems_allowed);
 	mmput(mm);
 }
 static void *cpuset_being_rebound;
 /**
@ -1038,88 +1063,32 @@ static void *cpuset_being_rebound;
 */
 static int update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem)
 {
 	/*
 	 * NOTE(review): this hunk interleaves removed ('-') and added ('+')
 	 * lines.  Going by the '+' lines, the patched function fills in a
 	 * cgroup_scanner (callback cpuset_change_nodemask, old nodemask in
 	 * scan.data) and returns the result of cgroup_scan_tasks().  Per the
 	 * commit message the mmarray[] allocation is what the patch removes,
 	 * even where this rendering shows those lines without a '-' marker;
 	 * verify against the real commit before relying on this text.
 	 */
 	struct task_struct *p;
 	struct mm_struct **mmarray;
 	int i, n, ntasks;
 	int migrate;
 	int fudge;
 	struct cgroup_iter it;
 	int retval;
 	struct cgroup_scanner scan;
 	cpuset_being_rebound = cs;		/* causes mpol_dup() rebind */
-	fudge = 10;				/* spare mmarray[] slots */
+	scan.cg = cs->css.cgroup;
-	fudge += cpumask_weight(cs->cpus_allowed);/* imagine 1 fork-bomb/cpu */
+	scan.test_task = NULL;
-	retval = -ENOMEM;
+	scan.process_task = cpuset_change_nodemask;
 	scan.heap = NULL;
 	/* cpuset_change_nodemask() reads scan->data back as the old nodemask. */
 	scan.data = (nodemask_t *)oldmem;
 	/*
-	 * Allocate mmarray[] to hold mm reference for each task
+	 * The mpol_rebind_mm() call takes mmap_sem, which we couldn't
-	 * in cpuset cs.  Can't kmalloc GFP_KERNEL while holding
+	 * take while holding tasklist_lock.  Forks can happen - the
-	 * tasklist_lock.  We could use GFP_ATOMIC, but with a
+	 * mpol_dup() cpuset_being_rebound check will catch such forks,
-	 * few more lines of code, we can retry until we get a big
+	 * and rebind their vma mempolicies too.  Because we still hold
-	 * enough mmarray[] w/o using GFP_ATOMIC.
+	 * the global cgroup_mutex, we know that no other rebind effort
-	 */
+	 * will be contending for the global variable cpuset_being_rebound.
 	while (1) {
 		ntasks = cgroup_task_count(cs->css.cgroup);  /* guess */
 		ntasks += fudge;
 		mmarray = kmalloc(ntasks * sizeof(*mmarray), GFP_KERNEL);
 		if (!mmarray)
 			goto done;
 		read_lock(&tasklist_lock);		/* block fork */
 		if (cgroup_task_count(cs->css.cgroup) <= ntasks)
 			break;				/* got enough */
 		read_unlock(&tasklist_lock);		/* try again */
 		kfree(mmarray);
 	}
 	n = 0;
 	/* Load up mmarray[] with mm reference for each task in cpuset. */
 	cgroup_iter_start(cs->css.cgroup, &it);
 	while ((p = cgroup_iter_next(cs->css.cgroup, &it))) {
 		struct mm_struct *mm;
 		if (n >= ntasks) {
 			printk(KERN_WARNING
 				"Cpuset mempolicy rebind incomplete.\n");
 			break;
 		}
 		mm = get_task_mm(p);
 		if (!mm)
 			continue;
 		mmarray[n++] = mm;
 	}
 	cgroup_iter_end(cs->css.cgroup, &it);
 	read_unlock(&tasklist_lock);
 	/*
 	 * Now that we've dropped the tasklist spinlock, we can
 	 * rebind the vma mempolicies of each mm in mmarray[] to their
 	 * new cpuset, and release that mm.  The mpol_rebind_mm()
 	 * call takes mmap_sem, which we couldn't take while holding
 	 * tasklist_lock.  Forks can happen again now - the mpol_dup()
 	 * cpuset_being_rebound check will catch such forks, and rebind
 	 * their vma mempolicies too.  Because we still hold the global
 	 * cgroup_mutex, we know that no other rebind effort will
 	 * be contending for the global variable cpuset_being_rebound.
 	 * It's ok if we rebind the same mm twice; mpol_rebind_mm()
 	 * is idempotent.  Also migrate pages in each mm to new nodes.
 	 */
-	migrate = is_memory_migrate(cs);
+	retval = cgroup_scan_tasks(&scan);
 	for (i = 0; i < n; i++) {
 		struct mm_struct *mm = mmarray[i];
 		mpol_rebind_mm(mm, &cs->mems_allowed);
 		if (migrate)
 			cpuset_migrate_mm(mm, oldmem, &cs->mems_allowed);
 		mmput(mm);
 	}
 	/* We're done rebinding vmas to this cpuset's new mems_allowed. */
 	kfree(mmarray);
 	cpuset_being_rebound = NULL;
-	retval = 0;
+
 done:
 	return retval;
 }