From e4e364e865b382f9d99c7fc230ec2ce7df21257a Mon Sep 17 00:00:00 2001 From: Paul Jackson Date: Fri, 31 Mar 2006 02:30:52 -0800 Subject: [PATCH] [PATCH] cpuset: memory migration interaction fix Fix memory migration so that it works regardless of what cpuset the invoking task is in. If a task invoked a memory migration, by doing one of: 1) writing a different nodemask to a cpuset's 'mems' file, or 2) writing a task's pid to a different cpuset's 'tasks' file, where the cpuset had its 'memory_migrate' option turned on, then the allocation of the new pages for the migrated task(s) was constrained by the invoking task's cpuset. If this task wasn't in a cpuset that allowed the requested memory nodes, the memory migration would happen to some other nodes that were in that invoking task's cpuset. This was usually surprising and puzzling behaviour: Why didn't the pages move? Why did the pages move -there-? To fix this, temporarily change the invoking task's 'mems_allowed' task_struct field to the nodes the migrating task is moving to, so that new pages can be allocated there. Signed-off-by: Paul Jackson Acked-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpuset.c | 57 ++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 52 insertions(+), 5 deletions(-) diff --git a/kernel/cpuset.c b/kernel/cpuset.c index bf42381a419..72248d1b9e3 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -833,6 +833,55 @@ static int update_cpumask(struct cpuset *cs, char *buf) return 0; } +/* + * cpuset_migrate_mm + * + * Migrate memory region from one set of nodes to another. + * + * Temporarily set the current task's mems_allowed to target nodes of migration, + * so that the migration code can allocate pages on these nodes. + * + * Call holding manage_mutex, so our current->cpuset won't change + * during this call, as manage_mutex holds off any attach_task() + * calls. 
Therefore we don't need to take task_lock around the + * call to guarantee_online_mems(), as we know no one is changing + * our task's cpuset. + * + * Hold callback_mutex around the two modifications of our task's + * mems_allowed to synchronize with cpuset_mems_allowed(). + * + * While the mm_struct we are migrating is typically from some + * other task, the task_struct mems_allowed that we are hacking + * is for our current task, which must allocate new pages for that + * migrating memory region. + * + * We call cpuset_update_task_memory_state() before hacking + * our task's mems_allowed, so that we are assured of being in + * sync with our task's cpuset, and in particular, callbacks to + * cpuset_update_task_memory_state() from nested page allocations + * won't see any mismatch of our cpuset and task mems_generation + * values, and so won't overwrite our hacked task's mems_allowed + * nodemask. + */ + +static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from, + const nodemask_t *to) +{ + struct task_struct *tsk = current; + + cpuset_update_task_memory_state(); + + mutex_lock(&callback_mutex); + tsk->mems_allowed = *to; + mutex_unlock(&callback_mutex); + + do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL); + + mutex_lock(&callback_mutex); + guarantee_online_mems(tsk->cpuset, &tsk->mems_allowed); + mutex_unlock(&callback_mutex); +} + /* * Handle user request to change the 'mems' memory placement * of a cpuset. 
Needs to validate the request, update the @@ -945,10 +994,8 @@ static int update_nodemask(struct cpuset *cs, char *buf) struct mm_struct *mm = mmarray[i]; mpol_rebind_mm(mm, &cs->mems_allowed); - if (migrate) { - do_migrate_pages(mm, &oldmem, &cs->mems_allowed, - MPOL_MF_MOVE_ALL); - } + if (migrate) + cpuset_migrate_mm(mm, &oldmem, &cs->mems_allowed); mmput(mm); } @@ -1184,7 +1231,7 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf) if (mm) { mpol_rebind_mm(mm, &to); if (is_memory_migrate(cs)) - do_migrate_pages(mm, &from, &to, MPOL_MF_MOVE_ALL); + cpuset_migrate_mm(mm, &from, &to); mmput(mm); }