diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index 74357cb9bc7..c7ac77e873b 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -150,6 +150,16 @@ extern void numa_policy_init(void); extern void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *new); extern void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new); +extern void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new); +#define set_cpuset_being_rebound(x) (cpuset_being_rebound = (x)) + +#ifdef CONFIG_CPUSET +#define current_cpuset_is_being_rebound() \ + (cpuset_being_rebound == current->cpuset) +#else +#define current_cpuset_is_being_rebound() 0 +#endif + extern struct mempolicy default_policy; extern struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr); @@ -165,6 +175,8 @@ static inline void check_highest_zone(int k) int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags); +extern void *cpuset_being_rebound; /* Trigger mpol_copy vma rebind */ + #else struct mempolicy {}; @@ -234,6 +246,12 @@ static inline void mpol_rebind_task(struct task_struct *tsk, { } +static inline void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new) +{ +} + +#define set_cpuset_being_rebound(x) do {} while (0) + static inline struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr) { diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 6004719f26e..19f87565be1 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -812,12 +812,24 @@ static int update_cpumask(struct cpuset *cs, char *buf) } /* + * Handle user request to change the 'mems' memory placement + * of a cpuset. Needs to validate the request, update the + * cpusets mems_allowed and mems_generation, and for each + * task in the cpuset, rebind any vma mempolicies. + * * Call with manage_sem held. May take callback_sem during call. + * Will take tasklist_lock, scan tasklist for tasks in cpuset cs, + * lock each such tasks mm->mmap_sem, scan its vma's and rebind + * their mempolicies to the cpusets new mems_allowed. */ static int update_nodemask(struct cpuset *cs, char *buf) { struct cpuset trialcs; + struct task_struct *g, *p; + struct mm_struct **mmarray; + int i, n, ntasks; + int fudge; int retval; trialcs = *cs; @@ -839,6 +851,76 @@ static int update_nodemask(struct cpuset *cs, char *buf) cs->mems_generation = atomic_read(&cpuset_mems_generation); up(&callback_sem); + set_cpuset_being_rebound(cs); /* causes mpol_copy() rebind */ + + fudge = 10; /* spare mmarray[] slots */ + fudge += cpus_weight(cs->cpus_allowed); /* imagine one fork-bomb/cpu */ + retval = -ENOMEM; + + /* + * Allocate mmarray[] to hold mm reference for each task + * in cpuset cs. Can't kmalloc GFP_KERNEL while holding + * tasklist_lock. We could use GFP_ATOMIC, but with a + * few more lines of code, we can retry until we get a big + * enough mmarray[] w/o using GFP_ATOMIC. + */ + while (1) { + ntasks = atomic_read(&cs->count); /* guess */ + ntasks += fudge; + mmarray = kmalloc(ntasks * sizeof(*mmarray), GFP_KERNEL); + if (!mmarray) + goto done; + write_lock_irq(&tasklist_lock); /* block fork */ + if (atomic_read(&cs->count) <= ntasks) + break; /* got enough */ + write_unlock_irq(&tasklist_lock); /* try again */ + kfree(mmarray); + } + + n = 0; + + /* Load up mmarray[] with mm reference for each task in cpuset. */ + do_each_thread(g, p) { + struct mm_struct *mm; + + if (n >= ntasks) { + printk(KERN_WARNING + "Cpuset mempolicy rebind incomplete.\n"); + continue; + } + if (p->cpuset != cs) + continue; + mm = get_task_mm(p); + if (!mm) + continue; + mmarray[n++] = mm; + } while_each_thread(g, p); + write_unlock_irq(&tasklist_lock); + + /* + * Now that we've dropped the tasklist spinlock, we can + * rebind the vma mempolicies of each mm in mmarray[] to their + * new cpuset, and release that mm. The mpol_rebind_mm() + * call takes mmap_sem, which we couldn't take while holding + * tasklist_lock. Forks can happen again now - the mpol_copy() + * cpuset_being_rebound check will catch such forks, and rebind + * their vma mempolicies too. Because we still hold the global + * cpuset manage_sem, we know that no other rebind effort will + * be contending for the global variable cpuset_being_rebound. + * It's ok if we rebind the same mm twice; mpol_rebind_mm() + * is idempotent. + */ + for (i = 0; i < n; i++) { + struct mm_struct *mm = mmarray[i]; + + mpol_rebind_mm(mm, &cs->mems_allowed); + mmput(mm); + } + + /* We're done rebinding vma's to this cpusets new mems_allowed. */ + kfree(mmarray); + set_cpuset_being_rebound(NULL); + retval = 0; done: return retval; } @@ -1011,6 +1093,7 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf) struct cpuset *oldcs; cpumask_t cpus; nodemask_t from, to; + struct mm_struct *mm; if (sscanf(pidbuf, "%d", &pid) != 1) return -EIO; @@ -1060,6 +1143,13 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf) to = cs->mems_allowed; up(&callback_sem); + + mm = get_task_mm(tsk); + if (mm) { + mpol_rebind_mm(mm, &to); + mmput(mm); + } + if (is_memory_migrate(cs)) do_migrate_pages(tsk->mm, &from, &to, MPOL_MF_MOVE_ALL); put_task_struct(tsk); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index c39bd86f4ea..1850d0aef4a 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1131,6 +1131,15 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order) } EXPORT_SYMBOL(alloc_pages_current); +/* + * If mpol_copy() sees current->cpuset == cpuset_being_rebound, then it + * rebinds the mempolicy its copying by calling mpol_rebind_policy() + * with the mems_allowed returned by cpuset_mems_allowed(). This + * keeps mempolicies cpuset relative after its cpuset moves. See + * further kernel/cpuset.c update_nodemask(). + */ +void *cpuset_being_rebound; + /* Slow path of a mempolicy copy */ struct mempolicy *__mpol_copy(struct mempolicy *old) { @@ -1138,6 +1147,10 @@ struct mempolicy *__mpol_copy(struct mempolicy *old) if (!new) return ERR_PTR(-ENOMEM); + if (current_cpuset_is_being_rebound()) { + nodemask_t mems = cpuset_mems_allowed(current); + mpol_rebind_policy(old, &mems); + } *new = *old; atomic_set(&new->refcnt, 1); if (new->policy == MPOL_BIND) { @@ -1480,6 +1493,22 @@ void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new) mpol_rebind_policy(tsk->mempolicy, new); } +/* + * Rebind each vma in mm to new nodemask. + * + * Call holding a reference to mm. Takes mm->mmap_sem during call. + */ + +void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new) +{ + struct vm_area_struct *vma; + + down_write(&mm->mmap_sem); + for (vma = mm->mmap; vma; vma = vma->vm_next) + mpol_rebind_policy(vma->vm_policy, new); + up_write(&mm->mmap_sem); +} + /* * Display pages allocated per node and memory policy via /proc. */