From 5cb350baf580017da38199625b7365b1763d7180 Mon Sep 17 00:00:00 2001
From: Dhaval Giani
Date: Mon, 15 Oct 2007 17:00:14 +0200
Subject: [PATCH] sched: group scheduling, sysfs tunables

Add tunables in sysfs to modify a user's cpu share.

A directory is created in sysfs for each new user in the system.

	/sys/kernel/uids/<uid>/cpu_share

Reading this file returns the cpu shares granted to the user.
Writing into this file modifies the cpu share for the user.
Only an administrator is allowed to modify a user's cpu share.

Ex:
	# cd /sys/kernel/uids/
	# cat 512/cpu_share
	1024
	# echo 2048 > 512/cpu_share
	# cat 512/cpu_share
	2048
	#

Signed-off-by: Srivatsa Vaddagiri
Signed-off-by: Dhaval Giani
Signed-off-by: Ingo Molnar
---
 Documentation/sched-design-CFS.txt |   67 ++++++++
 include/linux/sched.h              |   11 ++
 kernel/ksysfs.c                    |    8 +
 kernel/sched.c                     |   14 +-
 kernel/sched_debug.c               |   48 ------
 kernel/user.c                      |  256 ++++++++++++++++++++++++-----
 6 files changed, 317 insertions(+), 87 deletions(-)

diff --git a/Documentation/sched-design-CFS.txt b/Documentation/sched-design-CFS.txt
index 84901e7c050..88bcb876733 100644
--- a/Documentation/sched-design-CFS.txt
+++ b/Documentation/sched-design-CFS.txt
@@ -117,3 +117,70 @@ Some implementation details:
   iterators of the scheduling modules are used. The balancing code got
   quite a bit simpler as a result.
 
+
+Group scheduler extension to CFS
+================================
+
+Normally the scheduler operates on individual tasks and strives to provide
+fair CPU time to each task. Sometimes, it may be desirable to group tasks
+and provide fair CPU time to each such task group. For example, it may
+be desirable to first provide fair CPU time to each user on the system
+and then to each task belonging to a user.
+
+CONFIG_FAIR_GROUP_SCHED strives to achieve exactly that. It lets
+SCHED_NORMAL/BATCH tasks be grouped and divides CPU time fairly among such
+groups. At present, there are two (mutually exclusive) mechanisms to group
+tasks for CPU bandwidth control purposes:
+
+	- Based on user id (CONFIG_FAIR_USER_SCHED)
+	  In this option, tasks are grouped according to their user id.
+	- Based on the "cgroup" pseudo filesystem (CONFIG_FAIR_CGROUP_SCHED)
+	  This option lets the administrator create arbitrary groups
+	  of tasks, using the "cgroup" pseudo filesystem. See
+	  Documentation/cgroups.txt for more information about this
+	  filesystem.
+
+Only one of these options to group tasks can be chosen, not both.
+
+Group scheduler tunables:
+
+When CONFIG_FAIR_USER_SCHED is defined, a directory is created in sysfs for
+each new user and a "cpu_share" file is added in that directory.
+
+	# cd /sys/kernel/uids
+	# cat 512/cpu_share		# Display user 512's CPU share
+	1024
+	# echo 2048 > 512/cpu_share	# Modify user 512's CPU share
+	# cat 512/cpu_share		# Display user 512's CPU share
+	2048
+	#
+
+CPU bandwidth between two users is divided in the ratio of their CPU shares.
+For example, if you would like user "root" to get twice the bandwidth of user
+"guest", set the cpu_share for both users such that "root"'s cpu_share is
+twice "guest"'s cpu_share.
+
+When CONFIG_FAIR_CGROUP_SCHED is defined, a "cpu.shares" file is created
+for each group created using the pseudo filesystem.
+See the example steps below to create task groups and modify their CPU shares
+using the "cgroup" pseudo filesystem:
+
+	# mkdir /dev/cpuctl
+	# mount -t cgroup -ocpu none /dev/cpuctl
+	# cd /dev/cpuctl
+
+	# mkdir multimedia	# create "multimedia" group of tasks
+	# mkdir browser		# create "browser" group of tasks
+
+	# # Configure the multimedia group to receive twice the CPU
+	# # bandwidth of the browser group
+
+	# echo 2048 > multimedia/cpu.shares
+	# echo 1024 > browser/cpu.shares
+
+	# firefox &	# Launch firefox and move it to "browser" group
+	# echo <firefox_pid> > browser/tasks
+
+	# # Launch gmplayer (or your favourite movie player)
+	# echo <movie_player_pid> > multimedia/tasks
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 3cddbfc0c91..04233c8974d 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -87,6 +87,7 @@ struct sched_param {
 #include
 #include
 #include
+#include
 #include
 
@@ -599,9 +600,18 @@ struct user_struct {
 
 #ifdef CONFIG_FAIR_USER_SCHED
 	struct task_group *tg;
+	struct kset kset;
+	struct subsys_attribute user_attr;
+	struct work_struct work;
 #endif
 };
 
+#ifdef CONFIG_FAIR_USER_SCHED
+extern int uids_kobject_init(void);
+#else
+static inline int uids_kobject_init(void) { return 0; }
+#endif
+
 extern struct user_struct *find_user(uid_t);
 
 extern struct user_struct root_user;
@@ -1848,6 +1858,7 @@ extern struct task_group *sched_create_group(void);
 extern void sched_destroy_group(struct task_group *tg);
 extern void sched_move_task(struct task_struct *tsk);
 extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);
+extern unsigned long sched_group_shares(struct task_group *tg);
 
 #endif
 
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index d0e5c48e18c..6046939d080 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -14,6 +14,7 @@
 #include
 #include
 #include
+#include
 
 #define KERNEL_ATTR_RO(_name) \
 static struct subsys_attribute _name##_attr = __ATTR_RO(_name)
@@ -116,6 +117,13 @@ static int __init ksysfs_init(void)
 					   &notes_attr);
 	}
 
+	/*
+	 * Create "/sys/kernel/uids" directory and corresponding root user's
+	 * directory under it.
+	 */
+	if (!error)
+		error = uids_kobject_init();
+
 	return error;
 }
 
diff --git a/kernel/sched.c b/kernel/sched.c
index a3c3ec825f4..9ac99896db8 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -162,6 +162,8 @@ struct task_group {
 	/* runqueue "owned" by this group on each cpu */
 	struct cfs_rq **cfs_rq;
 	unsigned long shares;
+	/* spinlock to serialize modification to shares */
+	spinlock_t lock;
 };
 
 /* Default task group's sched entity on each cpu */
@@ -6533,6 +6535,7 @@ void __init sched_init(void)
 			se->parent = NULL;
 		}
 		init_task_group.shares = init_task_group_load;
+		spin_lock_init(&init_task_group.lock);
 #endif
 
 		for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
@@ -6777,6 +6780,7 @@ struct task_group *sched_create_group(void)
 	}
 
 	tg->shares = NICE_0_LOAD;
+	spin_lock_init(&tg->lock);
 
 	return tg;
 
@@ -6897,8 +6901,9 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
 {
 	int i;
 
+	spin_lock(&tg->lock);
 	if (tg->shares == shares)
-		return 0;
+		goto done;
 
 	/* return -EINVAL if the new value is not sane */
 
@@ -6906,7 +6911,14 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
 	for_each_possible_cpu(i)
 		set_se_shares(tg->se[i], shares);
 
+done:
+	spin_unlock(&tg->lock);
 	return 0;
 }
 
+unsigned long sched_group_shares(struct task_group *tg)
+{
+	return tg->shares;
+}
+
 #endif	/* CONFIG_FAIR_GROUP_SCHED */
 
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 6f87b31d233..0aab455a7b4 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -231,45 +231,6 @@ static void sysrq_sched_debug_show(void)
 	sched_debug_show(NULL, NULL);
 }
 
-#ifdef CONFIG_FAIR_USER_SCHED
-
-static DEFINE_MUTEX(root_user_share_mutex);
-
-static int
-root_user_share_read_proc(char *page, char **start, off_t off, int count,
-				int *eof, void *data)
-{
-	return sprintf(page, "%d\n", init_task_group_load);
-}
-
-static int
-root_user_share_write_proc(struct file *file, const char __user *buffer,
-				unsigned long count, void *data)
-{
-	unsigned long shares;
-	char kbuf[sizeof(unsigned long)+1];
-	int rc = 0;
-
-	if (copy_from_user(kbuf, buffer, sizeof(kbuf)))
-		return -EFAULT;
-
-	shares = simple_strtoul(kbuf, NULL, 0);
-
-	if (!shares)
-		shares = NICE_0_LOAD;
-
-	mutex_lock(&root_user_share_mutex);
-
-	init_task_group_load = shares;
-	rc = sched_group_set_shares(&init_task_group, shares);
-
-	mutex_unlock(&root_user_share_mutex);
-
-	return (rc < 0 ? rc : count);
-}
-
-#endif	/* CONFIG_FAIR_USER_SCHED */
-
 static int sched_debug_open(struct inode *inode, struct file *filp)
 {
 	return single_open(filp, sched_debug_show, NULL);
@@ -292,15 +253,6 @@ static int __init init_sched_debug_procfs(void)
 
 	pe->proc_fops = &sched_debug_fops;
 
-#ifdef CONFIG_FAIR_USER_SCHED
-	pe = create_proc_entry("root_user_cpu_share", 0644, NULL);
-	if (!pe)
-		return -ENOMEM;
-
-	pe->read_proc = root_user_share_read_proc;
-	pe->write_proc = root_user_share_write_proc;
-#endif
-
 	return 0;
 }
 
diff --git a/kernel/user.c b/kernel/user.c
index 0c9a7870d08..74cadea8466 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -55,7 +55,41 @@ struct user_struct root_user = {
 #endif
 };
 
+/*
+ * These routines must be called with the uidhash spinlock held!
+ */
+static inline void uid_hash_insert(struct user_struct *up,
+		struct hlist_head *hashent)
+{
+	hlist_add_head(&up->uidhash_node, hashent);
+}
+
+static inline void uid_hash_remove(struct user_struct *up)
+{
+	hlist_del_init(&up->uidhash_node);
+}
+
+static inline struct user_struct *uid_hash_find(uid_t uid,
+		struct hlist_head *hashent)
+{
+	struct user_struct *user;
+	struct hlist_node *h;
+
+	hlist_for_each_entry(user, h, hashent, uidhash_node) {
+		if (user->uid == uid) {
+			atomic_inc(&user->__count);
+			return user;
+		}
+	}
+
+	return NULL;
+}
+
 #ifdef CONFIG_FAIR_USER_SCHED
+
+static struct kobject uids_kobject; /* represents /sys/kernel/uids directory */
+static DEFINE_MUTEX(uids_mutex);
+
 static void sched_destroy_user(struct user_struct *up)
 {
 	sched_destroy_group(up->tg);
@@ -77,42 +111,173 @@ static void sched_switch_user(struct task_struct *p)
 	sched_move_task(p);
 }
 
+static inline void uids_mutex_lock(void)
+{
+	mutex_lock(&uids_mutex);
+}
+
+static inline void uids_mutex_unlock(void)
+{
+	mutex_unlock(&uids_mutex);
+}
+
+/* return cpu shares held by the user */
+ssize_t cpu_shares_show(struct kset *kset, char *buffer)
+{
+	struct user_struct *up = container_of(kset, struct user_struct, kset);
+
+	return sprintf(buffer, "%lu\n", sched_group_shares(up->tg));
+}
+
+/* modify cpu shares held by the user */
+ssize_t cpu_shares_store(struct kset *kset, const char *buffer, size_t size)
+{
+	struct user_struct *up = container_of(kset, struct user_struct, kset);
+	unsigned long shares;
+	int rc;
+
+	sscanf(buffer, "%lu", &shares);
+
+	rc = sched_group_set_shares(up->tg, shares);
+
+	return (rc ? rc : size);
+}
+
+static void user_attr_init(struct subsys_attribute *sa, char *name, int mode)
+{
+	sa->attr.name = name;
+	sa->attr.mode = mode;
+	sa->show = cpu_shares_show;
+	sa->store = cpu_shares_store;
+}
+
+/* Create "/sys/kernel/uids/<uid>" directory and
+ * "/sys/kernel/uids/<uid>/cpu_share" file for this user.
+ */
+static int user_kobject_create(struct user_struct *up)
+{
+	struct kset *kset = &up->kset;
+	struct kobject *kobj = &kset->kobj;
+	int error;
+
+	memset(kset, 0, sizeof(struct kset));
+	kobj->parent = &uids_kobject;	/* create under /sys/kernel/uids dir */
+	kobject_set_name(kobj, "%d", up->uid);
+	kset_init(kset);
+	user_attr_init(&up->user_attr, "cpu_share", 0644);
+
+	error = kobject_add(kobj);
+	if (error)
+		goto done;
+
+	error = sysfs_create_file(kobj, &up->user_attr.attr);
+	if (error)
+		kobject_del(kobj);
+
+done:
+	return error;
+}
+
+/* create these in sysfs filesystem:
+ *	"/sys/kernel/uids" directory
+ *	"/sys/kernel/uids/0" directory (for root user)
+ *	"/sys/kernel/uids/0/cpu_share" file (for root user)
+ */
+int __init uids_kobject_init(void)
+{
+	int error;
+
+	/* create under /sys/kernel dir */
+	uids_kobject.parent = &kernel_subsys.kobj;
+	kobject_set_name(&uids_kobject, "uids");
+	kobject_init(&uids_kobject);
+
+	error = kobject_add(&uids_kobject);
+	if (!error)
+		error = user_kobject_create(&root_user);
+
+	return error;
+}
+
+/* work function to remove sysfs directory for a user and free up
+ * corresponding structures.
+ */
+static void remove_user_sysfs_dir(struct work_struct *w)
+{
+	struct user_struct *up = container_of(w, struct user_struct, work);
+	struct kobject *kobj = &up->kset.kobj;
+	unsigned long flags;
+	int remove_user = 0;
+
+	/* Make uid_hash_remove() + sysfs_remove_file() + kobject_del()
+	 * atomic.
+	 */
+	uids_mutex_lock();
+
+	local_irq_save(flags);
+
+	if (atomic_dec_and_lock(&up->__count, &uidhash_lock)) {
+		uid_hash_remove(up);
+		remove_user = 1;
+		spin_unlock_irqrestore(&uidhash_lock, flags);
+	} else {
+		local_irq_restore(flags);
+	}
+
+	if (!remove_user)
+		goto done;
+
+	sysfs_remove_file(kobj, &up->user_attr.attr);
+	kobject_del(kobj);
+
+	sched_destroy_user(up);
+	key_put(up->uid_keyring);
+	key_put(up->session_keyring);
+	kmem_cache_free(uid_cachep, up);
+
+done:
+	uids_mutex_unlock();
+}
+
+/* IRQs are disabled and uidhash_lock is held upon function entry.
+ * IRQ state (as stored in flags) is restored and uidhash_lock released
+ * upon function exit.
+ */
+static inline void free_user(struct user_struct *up, unsigned long flags)
+{
+	/* restore back the count */
+	atomic_inc(&up->__count);
+	spin_unlock_irqrestore(&uidhash_lock, flags);
+
+	INIT_WORK(&up->work, remove_user_sysfs_dir);
+	schedule_work(&up->work);
+}
+
 #else	/* CONFIG_FAIR_USER_SCHED */
 
 static void sched_destroy_user(struct user_struct *up) { }
 static int sched_create_user(struct user_struct *up) { return 0; }
 static void sched_switch_user(struct task_struct *p) { }
 
+static inline int user_kobject_create(struct user_struct *up) { return 0; }
+static inline void uids_mutex_lock(void) { }
+static inline void uids_mutex_unlock(void) { }
+
+/* IRQs are disabled and uidhash_lock is held upon function entry.
+ * IRQ state (as stored in flags) is restored and uidhash_lock released
+ * upon function exit.
+ */
+static inline void free_user(struct user_struct *up, unsigned long flags)
+{
+	uid_hash_remove(up);
+	spin_unlock_irqrestore(&uidhash_lock, flags);
+	sched_destroy_user(up);
+	key_put(up->uid_keyring);
+	key_put(up->session_keyring);
+	kmem_cache_free(uid_cachep, up);
+}
+
 #endif	/* CONFIG_FAIR_USER_SCHED */
 
-/*
- * These routines must be called with the uidhash spinlock held!
- */
-static inline void uid_hash_insert(struct user_struct *up, struct hlist_head *hashent)
-{
-	hlist_add_head(&up->uidhash_node, hashent);
-}
-
-static inline void uid_hash_remove(struct user_struct *up)
-{
-	hlist_del_init(&up->uidhash_node);
-}
-
-static inline struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent)
-{
-	struct user_struct *user;
-	struct hlist_node *h;
-
-	hlist_for_each_entry(user, h, hashent, uidhash_node) {
-		if(user->uid == uid) {
-			atomic_inc(&user->__count);
-			return user;
-		}
-	}
-
-	return NULL;
-}
-
 /*
  * Locate the user_struct for the passed UID.  If found, take a ref on it.  The
  * caller must undo that ref with free_uid().
@@ -139,16 +304,10 @@ void free_uid(struct user_struct *up)
 		return;
 
 	local_irq_save(flags);
-	if (atomic_dec_and_lock(&up->__count, &uidhash_lock)) {
-		uid_hash_remove(up);
-		spin_unlock_irqrestore(&uidhash_lock, flags);
-		sched_destroy_user(up);
-		key_put(up->uid_keyring);
-		key_put(up->session_keyring);
-		kmem_cache_free(uid_cachep, up);
-	} else {
+	if (atomic_dec_and_lock(&up->__count, &uidhash_lock))
+		free_user(up, flags);
+	else
 		local_irq_restore(flags);
-	}
 }
 
 struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid)
@@ -156,6 +315,11 @@ struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid)
 	struct hlist_head *hashent = uidhashentry(ns, uid);
 	struct user_struct *up;
 
+	/* Make uid_hash_find() + user_kobject_create() + uid_hash_insert()
+	 * atomic.
+	 */
+	uids_mutex_lock();
+
 	spin_lock_irq(&uidhash_lock);
 	up = uid_hash_find(uid, hashent);
 	spin_unlock_irq(&uidhash_lock);
@@ -191,6 +355,15 @@ struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid)
 			return NULL;
 		}
 
+		if (user_kobject_create(new)) {
+			sched_destroy_user(new);
+			key_put(new->uid_keyring);
+			key_put(new->session_keyring);
+			kmem_cache_free(uid_cachep, new);
+			uids_mutex_unlock();
+			return NULL;
+		}
+
 		/*
 		 * Before adding this, check whether we raced
 		 * on adding the same user already..
@@ -198,7 +371,11 @@ struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid)
 		spin_lock_irq(&uidhash_lock);
 		up = uid_hash_find(uid, hashent);
 		if (up) {
-			sched_destroy_user(new);
+			/* This case is not possible when CONFIG_FAIR_USER_SCHED
+			 * is defined, since we serialize alloc_uid() using
+			 * uids_mutex. Hence no need to call
+			 * sched_destroy_user() or remove_user_sysfs_dir().
+			 */
			key_put(new->uid_keyring);
 			key_put(new->session_keyring);
 			kmem_cache_free(uid_cachep, new);
@@ -209,6 +386,9 @@ struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid)
 		spin_unlock_irq(&uidhash_lock);
 	}
 
+
+	uids_mutex_unlock();
+
 	return up;
 }
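
The tunable added by this patch is driven through plain sysfs file I/O, so no
special library is needed from userspace. The sketch below is illustrative
only and is not part of the patch: it reads and then rewrites the cpu_share of
uid 512, mirroring the changelog example; the uid and the new share value
(2048) are arbitrary, and the write only succeeds when run as root.

	/*
	 * Illustrative userspace sketch: query and adjust a user's CPU share
	 * via the /sys/kernel/uids/<uid>/cpu_share file added by this patch.
	 */
	#include <stdio.h>
	#include <fcntl.h>
	#include <unistd.h>

	int main(void)
	{
		char path[64], buf[32];
		ssize_t n;
		int fd;

		/* Path layout as created by user_kobject_create(). */
		snprintf(path, sizeof(path),
			 "/sys/kernel/uids/%d/cpu_share", 512);

		/* Read and print the current share (default is 1024). */
		fd = open(path, O_RDONLY);
		if (fd < 0) {
			perror("open for read");
			return 1;
		}
		n = read(fd, buf, sizeof(buf) - 1);
		if (n > 0) {
			buf[n] = '\0';
			printf("current cpu_share: %s", buf);
		}
		close(fd);

		/* Write a new share; only an administrator may do this. */
		fd = open(path, O_WRONLY);
		if (fd < 0) {
			perror("open for write");
			return 1;
		}
		if (write(fd, "2048\n", 5) != 5)
			perror("write");
		close(fd);

		return 0;
	}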