diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 149f62ba14a..e11f7728ec6 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -448,6 +448,8 @@ running once the system is up.
Format: [,]
See also Documentation/networking/decnet.txt.
+ delayacct [KNL] Enable per-task delay accounting
+
dhash_entries= [KNL]
Set number of hash buckets for dentry cache.
diff --git a/include/linux/delayacct.h b/include/linux/delayacct.h
new file mode 100644
index 00000000000..9572cfa1f12
--- /dev/null
+++ b/include/linux/delayacct.h
@@ -0,0 +1,69 @@
+/* delayacct.h - per-task delay accounting
+ *
+ * Copyright (C) Shailabh Nagar, IBM Corp. 2006
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
+ * the GNU General Public License for more details.
+ *
+ */
+
+#ifndef _LINUX_DELAYACCT_H
+#define _LINUX_DELAYACCT_H
+
+#include
+
+#ifdef CONFIG_TASK_DELAY_ACCT
+
+extern int delayacct_on; /* Delay accounting turned on/off */
+extern kmem_cache_t *delayacct_cache;
+extern void delayacct_init(void);
+extern void __delayacct_tsk_init(struct task_struct *);
+extern void __delayacct_tsk_exit(struct task_struct *);
+
+static inline void delayacct_set_flag(int flag)
+{
+ if (current->delays)
+ current->delays->flags |= flag;
+}
+
+static inline void delayacct_clear_flag(int flag)
+{
+ if (current->delays)
+ current->delays->flags &= ~flag;
+}
+
+static inline void delayacct_tsk_init(struct task_struct *tsk)
+{
+ /* reinitialize in case parent's non-null pointer was dup'ed*/
+ tsk->delays = NULL;
+ if (unlikely(delayacct_on))
+ __delayacct_tsk_init(tsk);
+}
+
+static inline void delayacct_tsk_exit(struct task_struct *tsk)
+{
+ if (tsk->delays)
+ __delayacct_tsk_exit(tsk);
+}
+
+#else
+static inline void delayacct_set_flag(int flag)
+{}
+static inline void delayacct_clear_flag(int flag)
+{}
+static inline void delayacct_init(void)
+{}
+static inline void delayacct_tsk_init(struct task_struct *tsk)
+{}
+static inline void delayacct_tsk_exit(struct task_struct *tsk)
+{}
+#endif /* CONFIG_TASK_DELAY_ACCT */
+
+#endif
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 1c876e27ff9..7a54e62763c 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -552,6 +552,23 @@ struct sched_info {
extern struct file_operations proc_schedstat_operations;
#endif
+#ifdef CONFIG_TASK_DELAY_ACCT
+struct task_delay_info {
+ spinlock_t lock;
+ unsigned int flags; /* Private per-task flags */
+
+ /* For each stat XXX, add following, aligned appropriately
+ *
+ * struct timespec XXX_start, XXX_end;
+ * u64 XXX_delay;
+ * u32 XXX_count;
+ *
+ * Atomicity of updates to XXX_delay, XXX_count protected by
+ * single lock above (split into XXX_lock if contention is an issue).
+ */
+};
+#endif
+
enum idle_type
{
SCHED_IDLE,
@@ -945,6 +962,9 @@ struct task_struct {
* cache last used pipe for splice
*/
struct pipe_inode_info *splice_pipe;
+#ifdef CONFIG_TASK_DELAY_ACCT
+ struct task_delay_info *delays;
+#endif
};
static inline pid_t process_group(struct task_struct *tsk)
diff --git a/include/linux/time.h b/include/linux/time.h
index c05f8bb9a32..a5b739967b7 100644
--- a/include/linux/time.h
+++ b/include/linux/time.h
@@ -70,6 +70,18 @@ extern unsigned long mktime(const unsigned int year, const unsigned int mon,
extern void set_normalized_timespec(struct timespec *ts, time_t sec, long nsec);
+/*
+ * sub = lhs - rhs, in normalized form
+ */
+static inline struct timespec timespec_sub(struct timespec lhs,
+ struct timespec rhs)
+{
+ struct timespec ts_delta;
+ set_normalized_timespec(&ts_delta, lhs.tv_sec - rhs.tv_sec,
+ lhs.tv_nsec - rhs.tv_nsec);
+ return ts_delta;
+}
+
/*
* Returns true if the timespec is norm, false if denorm:
*/
diff --git a/init/Kconfig b/init/Kconfig
index a5b073a103e..90498a3e53d 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -158,6 +158,16 @@ config BSD_PROCESS_ACCT_V3
for processing it. A preliminary version of these tools is available
at .
+config TASK_DELAY_ACCT
+ bool "Enable per-task delay accounting (EXPERIMENTAL)"
+ help
+ Collect information on time spent by a task waiting for system
+ resources like cpu, synchronous block I/O completion and swapping
+ in pages. Such statistics can help in setting a task's priorities
+ relative to other tasks for cpu, io, rss limits etc.
+
+ Say N if unsure.
+
config SYSCTL
bool "Sysctl support" if EMBEDDED
default y
diff --git a/init/main.c b/init/main.c
index 628b8e9e841..9e8e8c15214 100644
--- a/init/main.c
+++ b/init/main.c
@@ -41,6 +41,7 @@
#include
#include
#include
+#include
#include
#include
#include
@@ -574,6 +575,7 @@ asmlinkage void __init start_kernel(void)
proc_root_init();
#endif
cpuset_init();
+ delayacct_init();
check_bugs();
diff --git a/kernel/Makefile b/kernel/Makefile
index 47dbcd570cd..87bb34cc893 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -48,6 +48,7 @@ obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
obj-$(CONFIG_SECCOMP) += seccomp.o
obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
obj-$(CONFIG_RELAY) += relay.o
+obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y)
# According to Alan Modra , the -fno-omit-frame-pointer is
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
new file mode 100644
index 00000000000..fbf7f228495
--- /dev/null
+++ b/kernel/delayacct.c
@@ -0,0 +1,87 @@
+/* delayacct.c - per-task delay accounting
+ *
+ * Copyright (C) Shailabh Nagar, IBM Corp. 2006
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
+ * the GNU General Public License for more details.
+ */
+
+#include
+#include
+#include
+#include
+#include
+
+int delayacct_on __read_mostly; /* Delay accounting turned on/off */
+kmem_cache_t *delayacct_cache;
+
+static int __init delayacct_setup_enable(char *str)
+{
+ delayacct_on = 1;
+ return 1;
+}
+__setup("delayacct", delayacct_setup_enable);
+
+void delayacct_init(void)
+{
+ delayacct_cache = kmem_cache_create("delayacct_cache",
+ sizeof(struct task_delay_info),
+ 0,
+ SLAB_PANIC,
+ NULL, NULL);
+ delayacct_tsk_init(&init_task);
+}
+
+void __delayacct_tsk_init(struct task_struct *tsk)
+{
+ tsk->delays = kmem_cache_zalloc(delayacct_cache, SLAB_KERNEL);
+ if (tsk->delays)
+ spin_lock_init(&tsk->delays->lock);
+}
+
+void __delayacct_tsk_exit(struct task_struct *tsk)
+{
+ kmem_cache_free(delayacct_cache, tsk->delays);
+ tsk->delays = NULL;
+}
+
+/*
+ * Start accounting for a delay statistic using
+ * its starting timestamp (@start)
+ */
+
+static inline void delayacct_start(struct timespec *start)
+{
+ do_posix_clock_monotonic_gettime(start);
+}
+
+/*
+ * Finish delay accounting for a statistic using
+ * its timestamps (@start, @end), accumalator (@total) and @count
+ */
+
+static void delayacct_end(struct timespec *start, struct timespec *end,
+ u64 *total, u32 *count)
+{
+ struct timespec ts;
+ s64 ns;
+
+ do_posix_clock_monotonic_gettime(end);
+ ts = timespec_sub(*end, *start);
+ ns = timespec_to_ns(&ts);
+ if (ns < 0)
+ return;
+
+ spin_lock(¤t->delays->lock);
+ *total += ns;
+ (*count)++;
+ spin_unlock(¤t->delays->lock);
+}
+
diff --git a/kernel/exit.c b/kernel/exit.c
index 6664c084783..3c2cf91defa 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -25,6 +25,7 @@
#include
#include
#include
+#include
#include
#include
#include
@@ -900,6 +901,7 @@ fastcall NORET_TYPE void do_exit(long code)
#endif
if (unlikely(tsk->audit_context))
audit_free(tsk);
+ delayacct_tsk_exit(tsk);
exit_mm(tsk);
if (group_dead)
diff --git a/kernel/fork.c b/kernel/fork.c
index 926e5a68ea9..451cfd35bf2 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -43,6 +43,7 @@
#include
#include
#include
+#include
#include
#include
@@ -1000,6 +1001,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
goto bad_fork_cleanup_put_domain;
p->did_exec = 0;
+ delayacct_tsk_init(p); /* Must remain after dup_task_struct() */
copy_flags(clone_flags, p);
p->pid = pid;
retval = -EFAULT;