diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 149f62ba14a..e11f7728ec6 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -448,6 +448,8 @@ running once the system is up. Format: [,] See also Documentation/networking/decnet.txt. + delayacct [KNL] Enable per-task delay accounting + dhash_entries= [KNL] Set number of hash buckets for dentry cache. diff --git a/include/linux/delayacct.h b/include/linux/delayacct.h new file mode 100644 index 00000000000..9572cfa1f12 --- /dev/null +++ b/include/linux/delayacct.h @@ -0,0 +1,69 @@ +/* delayacct.h - per-task delay accounting + * + * Copyright (C) Shailabh Nagar, IBM Corp. 2006 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + * + */ + +#ifndef _LINUX_DELAYACCT_H +#define _LINUX_DELAYACCT_H + +#include + +#ifdef CONFIG_TASK_DELAY_ACCT + +extern int delayacct_on; /* Delay accounting turned on/off */ +extern kmem_cache_t *delayacct_cache; +extern void delayacct_init(void); +extern void __delayacct_tsk_init(struct task_struct *); +extern void __delayacct_tsk_exit(struct task_struct *); + +static inline void delayacct_set_flag(int flag) +{ + if (current->delays) + current->delays->flags |= flag; +} + +static inline void delayacct_clear_flag(int flag) +{ + if (current->delays) + current->delays->flags &= ~flag; +} + +static inline void delayacct_tsk_init(struct task_struct *tsk) +{ + /* reinitialize in case parent's non-null pointer was dup'ed*/ + tsk->delays = NULL; + if (unlikely(delayacct_on)) + __delayacct_tsk_init(tsk); +} + +static inline void delayacct_tsk_exit(struct task_struct *tsk) +{ + if (tsk->delays) + __delayacct_tsk_exit(tsk); +} + +#else +static inline void delayacct_set_flag(int flag) +{} +static inline void delayacct_clear_flag(int flag) +{} +static inline void delayacct_init(void) +{} +static inline void delayacct_tsk_init(struct task_struct *tsk) +{} +static inline void delayacct_tsk_exit(struct task_struct *tsk) +{} +#endif /* CONFIG_TASK_DELAY_ACCT */ + +#endif diff --git a/include/linux/sched.h b/include/linux/sched.h index 1c876e27ff9..7a54e62763c 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -552,6 +552,23 @@ struct sched_info { extern struct file_operations proc_schedstat_operations; #endif +#ifdef CONFIG_TASK_DELAY_ACCT +struct task_delay_info { + spinlock_t lock; + unsigned int flags; /* Private per-task flags */ + + /* For each stat XXX, add following, aligned appropriately + * + * struct timespec XXX_start, XXX_end; + * u64 XXX_delay; + * u32 XXX_count; + * + * Atomicity of updates to XXX_delay, XXX_count protected by + * single lock above (split into XXX_lock if contention is an issue). + */ +}; +#endif + enum idle_type { SCHED_IDLE, @@ -945,6 +962,9 @@ struct task_struct { * cache last used pipe for splice */ struct pipe_inode_info *splice_pipe; +#ifdef CONFIG_TASK_DELAY_ACCT + struct task_delay_info *delays; +#endif }; static inline pid_t process_group(struct task_struct *tsk) diff --git a/include/linux/time.h b/include/linux/time.h index c05f8bb9a32..a5b739967b7 100644 --- a/include/linux/time.h +++ b/include/linux/time.h @@ -70,6 +70,18 @@ extern unsigned long mktime(const unsigned int year, const unsigned int mon, extern void set_normalized_timespec(struct timespec *ts, time_t sec, long nsec); +/* + * sub = lhs - rhs, in normalized form + */ +static inline struct timespec timespec_sub(struct timespec lhs, + struct timespec rhs) +{ + struct timespec ts_delta; + set_normalized_timespec(&ts_delta, lhs.tv_sec - rhs.tv_sec, + lhs.tv_nsec - rhs.tv_nsec); + return ts_delta; +} + /* * Returns true if the timespec is norm, false if denorm: */ diff --git a/init/Kconfig b/init/Kconfig index a5b073a103e..90498a3e53d 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -158,6 +158,16 @@ config BSD_PROCESS_ACCT_V3 for processing it. A preliminary version of these tools is available at . +config TASK_DELAY_ACCT + bool "Enable per-task delay accounting (EXPERIMENTAL)" + help + Collect information on time spent by a task waiting for system + resources like cpu, synchronous block I/O completion and swapping + in pages. Such statistics can help in setting a task's priorities + relative to other tasks for cpu, io, rss limits etc. + + Say N if unsure. + config SYSCTL bool "Sysctl support" if EMBEDDED default y diff --git a/init/main.c b/init/main.c index 628b8e9e841..9e8e8c15214 100644 --- a/init/main.c +++ b/init/main.c @@ -41,6 +41,7 @@ #include #include #include +#include #include #include #include @@ -574,6 +575,7 @@ asmlinkage void __init start_kernel(void) proc_root_init(); #endif cpuset_init(); + delayacct_init(); check_bugs(); diff --git a/kernel/Makefile b/kernel/Makefile index 47dbcd570cd..87bb34cc893 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -48,6 +48,7 @@ obj-$(CONFIG_GENERIC_HARDIRQS) += irq/ obj-$(CONFIG_SECCOMP) += seccomp.o obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o obj-$(CONFIG_RELAY) += relay.o +obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y) # According to Alan Modra , the -fno-omit-frame-pointer is diff --git a/kernel/delayacct.c b/kernel/delayacct.c new file mode 100644 index 00000000000..fbf7f228495 --- /dev/null +++ b/kernel/delayacct.c @@ -0,0 +1,87 @@ +/* delayacct.c - per-task delay accounting + * + * Copyright (C) Shailabh Nagar, IBM Corp. 2006 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + */ + +#include +#include +#include +#include +#include + +int delayacct_on __read_mostly; /* Delay accounting turned on/off */ +kmem_cache_t *delayacct_cache; + +static int __init delayacct_setup_enable(char *str) +{ + delayacct_on = 1; + return 1; +} +__setup("delayacct", delayacct_setup_enable); + +void delayacct_init(void) +{ + delayacct_cache = kmem_cache_create("delayacct_cache", + sizeof(struct task_delay_info), + 0, + SLAB_PANIC, + NULL, NULL); + delayacct_tsk_init(&init_task); +} + +void __delayacct_tsk_init(struct task_struct *tsk) +{ + tsk->delays = kmem_cache_zalloc(delayacct_cache, SLAB_KERNEL); + if (tsk->delays) + spin_lock_init(&tsk->delays->lock); +} + +void __delayacct_tsk_exit(struct task_struct *tsk) +{ + kmem_cache_free(delayacct_cache, tsk->delays); + tsk->delays = NULL; +} + +/* + * Start accounting for a delay statistic using + * its starting timestamp (@start) + */ + +static inline void delayacct_start(struct timespec *start) +{ + do_posix_clock_monotonic_gettime(start); +} + +/* + * Finish delay accounting for a statistic using + * its timestamps (@start, @end), accumalator (@total) and @count + */ + +static void delayacct_end(struct timespec *start, struct timespec *end, + u64 *total, u32 *count) +{ + struct timespec ts; + s64 ns; + + do_posix_clock_monotonic_gettime(end); + ts = timespec_sub(*end, *start); + ns = timespec_to_ns(&ts); + if (ns < 0) + return; + + spin_lock(¤t->delays->lock); + *total += ns; + (*count)++; + spin_unlock(¤t->delays->lock); +} + diff --git a/kernel/exit.c b/kernel/exit.c index 6664c084783..3c2cf91defa 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include @@ -900,6 +901,7 @@ fastcall NORET_TYPE void do_exit(long code) #endif if (unlikely(tsk->audit_context)) audit_free(tsk); + delayacct_tsk_exit(tsk); exit_mm(tsk); if (group_dead) diff --git a/kernel/fork.c b/kernel/fork.c index 926e5a68ea9..451cfd35bf2 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -43,6 +43,7 @@ #include #include #include +#include #include #include @@ -1000,6 +1001,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, goto bad_fork_cleanup_put_domain; p->did_exec = 0; + delayacct_tsk_init(p); /* Must remain after dup_task_struct() */ copy_flags(clone_flags, p); p->pid = pid; retval = -EFAULT;