From 457533a7d3402d1d91fbc125c8bd1bd16dcd3cd4 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Wed, 31 Dec 2008 15:11:37 +0100 Subject: [PATCH 1/6] [PATCH] fix scaled & unscaled cputime accounting The utimescaled / stimescaled fields in the task structure and the global cpustat should be set on all architectures. On s390 the calls to account_user_time_scaled and account_system_time_scaled never have been added. In addition system time that is accounted as guest time to the user time of a process is accounted to the scaled system time instead of the scaled user time. To fix the bugs and to prevent future forgetfulness this patch merges account_system_time_scaled into account_system_time and account_user_time_scaled into account_user_time. Cc: Benjamin Herrenschmidt Cc: Hidetoshi Seto Cc: Tony Luck Cc: Jeremy Fitzhardinge Cc: Chris Wright Cc: Michael Neuling Acked-by: Paul Mackerras Signed-off-by: Martin Schwidefsky --- arch/ia64/kernel/time.c | 12 ++++------- arch/powerpc/kernel/time.c | 7 ++----- arch/s390/kernel/vtime.c | 10 ++++----- include/linux/kernel_stat.h | 6 ++---- kernel/sched.c | 41 +++++++++++++++---------------------- kernel/time/tick-sched.c | 5 +++-- kernel/timer.c | 12 +++++------ 7 files changed, 37 insertions(+), 56 deletions(-) diff --git a/arch/ia64/kernel/time.c b/arch/ia64/kernel/time.c index 65c10a42c88..4ee36781704 100644 --- a/arch/ia64/kernel/time.c +++ b/arch/ia64/kernel/time.c @@ -93,13 +93,11 @@ void ia64_account_on_switch(struct task_struct *prev, struct task_struct *next) now = ia64_get_itc(); delta_stime = cycle_to_cputime(pi->ac_stime + (now - pi->ac_stamp)); - account_system_time(prev, 0, delta_stime); - account_system_time_scaled(prev, delta_stime); + account_system_time(prev, 0, delta_stime, delta_stime); if (pi->ac_utime) { delta_utime = cycle_to_cputime(pi->ac_utime); - account_user_time(prev, delta_utime); - account_user_time_scaled(prev, delta_utime); + account_user_time(prev, delta_utime, delta_utime); } pi->ac_stamp = ni->ac_stamp = now; @@ -122,8 +120,7 @@ void account_system_vtime(struct task_struct *tsk) now = ia64_get_itc(); delta_stime = cycle_to_cputime(ti->ac_stime + (now - ti->ac_stamp)); - account_system_time(tsk, 0, delta_stime); - account_system_time_scaled(tsk, delta_stime); + account_system_time(tsk, 0, delta_stime, delta_stime); ti->ac_stime = 0; ti->ac_stamp = now; @@ -143,8 +140,7 @@ void account_process_tick(struct task_struct *p, int user_tick) if (ti->ac_utime) { delta_utime = cycle_to_cputime(ti->ac_utime); - account_user_time(p, delta_utime); - account_user_time_scaled(p, delta_utime); + account_user_time(p, delta_utime, delta_utime); ti->ac_utime = 0; } } diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index e1f3a514042..92650ccad2e 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -256,8 +256,7 @@ void account_system_vtime(struct task_struct *tsk) delta += sys_time; get_paca()->system_time = 0; } - account_system_time(tsk, 0, delta); - account_system_time_scaled(tsk, deltascaled); + account_system_time(tsk, 0, delta, deltascaled); per_cpu(cputime_last_delta, smp_processor_id()) = delta; per_cpu(cputime_scaled_last_delta, smp_processor_id()) = deltascaled; local_irq_restore(flags); @@ -275,10 +274,8 @@ void account_process_tick(struct task_struct *tsk, int user_tick) utime = get_paca()->user_time; get_paca()->user_time = 0; - account_user_time(tsk, utime); - utimescaled = cputime_to_scaled(utime); - account_user_time_scaled(tsk, utimescaled); + account_user_time(tsk, utime, utimescaled); } /* diff --git a/arch/s390/kernel/vtime.c b/arch/s390/kernel/vtime.c index 75a6e62ea97..07283aea2e5 100644 --- a/arch/s390/kernel/vtime.c +++ b/arch/s390/kernel/vtime.c @@ -50,12 +50,12 @@ void account_process_tick(struct task_struct *tsk, int user_tick) rcu_user_flag = cputime != 0; S390_lowcore.user_timer -= cputime << 12; S390_lowcore.steal_clock -= cputime << 12; - account_user_time(tsk, cputime); + account_user_time(tsk, cputime, cputime); cputime = S390_lowcore.system_timer >> 12; S390_lowcore.system_timer -= cputime << 12; S390_lowcore.steal_clock -= cputime << 12; - account_system_time(tsk, HARDIRQ_OFFSET, cputime); + account_system_time(tsk, HARDIRQ_OFFSET, cputime, cputime); cputime = S390_lowcore.steal_clock; if ((__s64) cputime > 0) { @@ -82,12 +82,12 @@ void account_vtime(struct task_struct *tsk) cputime = S390_lowcore.user_timer >> 12; S390_lowcore.user_timer -= cputime << 12; S390_lowcore.steal_clock -= cputime << 12; - account_user_time(tsk, cputime); + account_user_time(tsk, cputime, cputime); cputime = S390_lowcore.system_timer >> 12; S390_lowcore.system_timer -= cputime << 12; S390_lowcore.steal_clock -= cputime << 12; - account_system_time(tsk, 0, cputime); + account_system_time(tsk, 0, cputime, cputime); } /* @@ -107,7 +107,7 @@ void account_system_vtime(struct task_struct *tsk) cputime = S390_lowcore.system_timer >> 12; S390_lowcore.system_timer -= cputime << 12; S390_lowcore.steal_clock -= cputime << 12; - account_system_time(tsk, 0, cputime); + account_system_time(tsk, 0, cputime, cputime); } EXPORT_SYMBOL_GPL(account_system_vtime); diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h index 4ee4b3d2316..c78a459662a 100644 --- a/include/linux/kernel_stat.h +++ b/include/linux/kernel_stat.h @@ -79,10 +79,8 @@ static inline unsigned int kstat_irqs(unsigned int irq) } extern unsigned long long task_delta_exec(struct task_struct *); -extern void account_user_time(struct task_struct *, cputime_t); -extern void account_user_time_scaled(struct task_struct *, cputime_t); -extern void account_system_time(struct task_struct *, int, cputime_t); -extern void account_system_time_scaled(struct task_struct *, cputime_t); +extern void account_user_time(struct task_struct *, cputime_t, cputime_t); +extern void account_system_time(struct task_struct *, int, cputime_t, cputime_t); extern void account_steal_time(struct task_struct *, cputime_t); #endif /* _LINUX_KERNEL_STAT_H */ diff --git a/kernel/sched.c b/kernel/sched.c index fff1c4a20b6..5b03679ff71 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4080,13 +4080,17 @@ unsigned long long task_delta_exec(struct task_struct *p) * Account user cpu time to a process. * @p: the process that the cpu time gets accounted to * @cputime: the cpu time spent in user space since the last update + * @cputime_scaled: cputime scaled by cpu frequency */ -void account_user_time(struct task_struct *p, cputime_t cputime) +void account_user_time(struct task_struct *p, cputime_t cputime, + cputime_t cputime_scaled) { struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; cputime64_t tmp; + /* Add user time to process. */ p->utime = cputime_add(p->utime, cputime); + p->utimescaled = cputime_add(p->utimescaled, cputime_scaled); account_group_user_time(p, cputime); /* Add user time to cpustat. */ @@ -4103,51 +4107,49 @@ void account_user_time(struct task_struct *p, cputime_t cputime) * Account guest cpu time to a process. * @p: the process that the cpu time gets accounted to * @cputime: the cpu time spent in virtual machine since the last update + * @cputime_scaled: cputime scaled by cpu frequency */ -static void account_guest_time(struct task_struct *p, cputime_t cputime) +static void account_guest_time(struct task_struct *p, cputime_t cputime, + cputime_t cputime_scaled) { cputime64_t tmp; struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; tmp = cputime_to_cputime64(cputime); + /* Add guest time to process. */ p->utime = cputime_add(p->utime, cputime); + p->utimescaled = cputime_add(p->utimescaled, cputime_scaled); account_group_user_time(p, cputime); p->gtime = cputime_add(p->gtime, cputime); + /* Add guest time to cpustat. */ cpustat->user = cputime64_add(cpustat->user, tmp); cpustat->guest = cputime64_add(cpustat->guest, tmp); } -/* - * Account scaled user cpu time to a process. - * @p: the process that the cpu time gets accounted to - * @cputime: the cpu time spent in user space since the last update - */ -void account_user_time_scaled(struct task_struct *p, cputime_t cputime) -{ - p->utimescaled = cputime_add(p->utimescaled, cputime); -} - /* * Account system cpu time to a process. * @p: the process that the cpu time gets accounted to * @hardirq_offset: the offset to subtract from hardirq_count() * @cputime: the cpu time spent in kernel space since the last update + * @cputime_scaled: cputime scaled by cpu frequency */ void account_system_time(struct task_struct *p, int hardirq_offset, - cputime_t cputime) + cputime_t cputime, cputime_t cputime_scaled) { struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; struct rq *rq = this_rq(); cputime64_t tmp; if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) { - account_guest_time(p, cputime); + account_guest_time(p, cputime, cputime_scaled); return; } + /* Add system time to process. */ p->stime = cputime_add(p->stime, cputime); + p->stimescaled = cputime_add(p->stimescaled, cputime_scaled); account_group_system_time(p, cputime); /* Add system time to cpustat. */ @@ -4166,17 +4168,6 @@ void account_system_time(struct task_struct *p, int hardirq_offset, acct_update_integrals(p); } -/* - * Account scaled system cpu time to a process. - * @p: the process that the cpu time gets accounted to - * @hardirq_offset: the offset to subtract from hardirq_count() - * @cputime: the cpu time spent in kernel space since the last update - */ -void account_system_time_scaled(struct task_struct *p, cputime_t cputime) -{ - p->stimescaled = cputime_add(p->stimescaled, cputime); -} - /* * Account for involuntary wait time. * @p: the process from which the cpu time has been stolen diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 8f3fc2582d3..1f2fce2479f 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -420,6 +420,7 @@ void tick_nohz_restart_sched_tick(void) int cpu = smp_processor_id(); struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); unsigned long ticks; + cputime_t cputime; ktime_t now; local_irq_disable(); @@ -452,8 +453,8 @@ void tick_nohz_restart_sched_tick(void) */ if (ticks && ticks < LONG_MAX) { add_preempt_count(HARDIRQ_OFFSET); - account_system_time(current, HARDIRQ_OFFSET, - jiffies_to_cputime(ticks)); + cputime = jiffies_to_cputime(ticks); + account_system_time(current, HARDIRQ_OFFSET, cputime, cputime); sub_preempt_count(HARDIRQ_OFFSET); } diff --git a/kernel/timer.c b/kernel/timer.c index 566257d1dc1..b5efb528aa1 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -1023,13 +1023,11 @@ void account_process_tick(struct task_struct *p, int user_tick) { cputime_t one_jiffy = jiffies_to_cputime(1); - if (user_tick) { - account_user_time(p, one_jiffy); - account_user_time_scaled(p, cputime_to_scaled(one_jiffy)); - } else { - account_system_time(p, HARDIRQ_OFFSET, one_jiffy); - account_system_time_scaled(p, cputime_to_scaled(one_jiffy)); - } + if (user_tick) + account_user_time(p, one_jiffy, cputime_to_scaled(one_jiffy)); + else + account_system_time(p, HARDIRQ_OFFSET, one_jiffy, + cputime_to_scaled(one_jiffy)); } #endif From 79741dd35713ff4f6fd0eafd59fa94e8a4ba922d Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Wed, 31 Dec 2008 15:11:38 +0100 Subject: [PATCH 2/6] [PATCH] idle cputime accounting The cpu time spent by the idle process actually doing something is currently accounted as idle time. This is plain wrong, the architectures that support VIRT_CPU_ACCOUNTING=y can do better: distinguish between the time spent doing nothing and the time spent by idle doing work. The first is accounted with account_idle_time and the second with account_system_time. The architectures that use the account_xxx_time interface directly and not the account_xxx_ticks interface now need to do the check for the idle process in their arch code. In particular to improve the system vs true idle time accounting the arch code needs to measure the true idle time instead of just testing for the idle process. To improve the tick based accounting as well we would need an architecture primitive that can tell us if the pt_regs of the interrupted context points to the magic instruction that halts the cpu. In addition idle time is no more added to the stime of the idle process. This field now contains the system time of the idle process as it should be. On systems without VIRT_CPU_ACCOUNTING this will always be zero as every tick that occurs while idle is running will be accounted as idle time. This patch contains the necessary common code changes to be able to distinguish idle system time and true idle time. The architectures with support for VIRT_CPU_ACCOUNTING need some changes to exploit this. Signed-off-by: Martin Schwidefsky --- arch/ia64/kernel/time.c | 10 ++++- arch/powerpc/kernel/process.c | 1 + arch/powerpc/kernel/time.c | 13 ++++-- arch/s390/kernel/vtime.c | 20 +++++++-- arch/x86/xen/time.c | 10 ++--- include/linux/kernel_stat.h | 7 ++- include/linux/sched.h | 1 - kernel/sched.c | 80 +++++++++++++++++++++++++++-------- kernel/time/tick-sched.c | 13 +++--- kernel/timer.c | 13 ------ 10 files changed, 114 insertions(+), 54 deletions(-) diff --git a/arch/ia64/kernel/time.c b/arch/ia64/kernel/time.c index 4ee36781704..f0ebb342409 100644 --- a/arch/ia64/kernel/time.c +++ b/arch/ia64/kernel/time.c @@ -93,7 +93,10 @@ void ia64_account_on_switch(struct task_struct *prev, struct task_struct *next) now = ia64_get_itc(); delta_stime = cycle_to_cputime(pi->ac_stime + (now - pi->ac_stamp)); - account_system_time(prev, 0, delta_stime, delta_stime); + if (idle_task(smp_processor_id()) != prev) + account_system_time(prev, 0, delta_stime, delta_stime); + else + account_idle_time(delta_stime); if (pi->ac_utime) { delta_utime = cycle_to_cputime(pi->ac_utime); @@ -120,7 +123,10 @@ void account_system_vtime(struct task_struct *tsk) now = ia64_get_itc(); delta_stime = cycle_to_cputime(ti->ac_stime + (now - ti->ac_stamp)); - account_system_time(tsk, 0, delta_stime, delta_stime); + if (irq_count() || idle_task(smp_processor_id()) != tsk) + account_system_time(tsk, 0, delta_stime, delta_stime); + else + account_idle_time(delta_stime); ti->ac_stime = 0; ti->ac_stamp = now; diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index 51b201ddf9a..fb7049c054c 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index 92650ccad2e..3be355c1cfa 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -256,7 +256,10 @@ void account_system_vtime(struct task_struct *tsk) delta += sys_time; get_paca()->system_time = 0; } - account_system_time(tsk, 0, delta, deltascaled); + if (in_irq() || idle_task(smp_processor_id()) != tsk) + account_system_time(tsk, 0, delta, deltascaled); + else + account_idle_time(delta); per_cpu(cputime_last_delta, smp_processor_id()) = delta; per_cpu(cputime_scaled_last_delta, smp_processor_id()) = deltascaled; local_irq_restore(flags); @@ -335,8 +338,12 @@ void calculate_steal_time(void) tb = mftb(); purr = mfspr(SPRN_PURR); stolen = (tb - pme->tb) - (purr - pme->purr); - if (stolen > 0) - account_steal_time(current, stolen); + if (stolen > 0) { + if (idle_task(smp_processor_id()) != current) + account_steal_time(stolen); + else + account_idle_time(stolen); + } pme->tb = tb; pme->purr = purr; } diff --git a/arch/s390/kernel/vtime.c b/arch/s390/kernel/vtime.c index 07283aea2e5..4a4a34caec5 100644 --- a/arch/s390/kernel/vtime.c +++ b/arch/s390/kernel/vtime.c @@ -55,13 +55,19 @@ void account_process_tick(struct task_struct *tsk, int user_tick) cputime = S390_lowcore.system_timer >> 12; S390_lowcore.system_timer -= cputime << 12; S390_lowcore.steal_clock -= cputime << 12; - account_system_time(tsk, HARDIRQ_OFFSET, cputime, cputime); + if (idle_task(smp_processor_id()) != current) + account_system_time(tsk, HARDIRQ_OFFSET, cputime, cputime); + else + account_idle_time(cputime); cputime = S390_lowcore.steal_clock; if ((__s64) cputime > 0) { cputime >>= 12; S390_lowcore.steal_clock -= cputime << 12; - account_steal_time(tsk, cputime); + if (idle_task(smp_processor_id()) != current) + account_steal_time(cputime); + else + account_idle_time(cputime); } } @@ -87,7 +93,10 @@ void account_vtime(struct task_struct *tsk) cputime = S390_lowcore.system_timer >> 12; S390_lowcore.system_timer -= cputime << 12; S390_lowcore.steal_clock -= cputime << 12; - account_system_time(tsk, 0, cputime, cputime); + if (idle_task(smp_processor_id()) != current) + account_system_time(tsk, 0, cputime, cputime); + else + account_idle_time(cputime); } /* @@ -107,7 +116,10 @@ void account_system_vtime(struct task_struct *tsk) cputime = S390_lowcore.system_timer >> 12; S390_lowcore.system_timer -= cputime << 12; S390_lowcore.steal_clock -= cputime << 12; - account_system_time(tsk, 0, cputime, cputime); + if (in_irq() || idle_task(smp_processor_id()) != current) + account_system_time(tsk, 0, cputime, cputime); + else + account_idle_time(cputime); } EXPORT_SYMBOL_GPL(account_system_vtime); diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index c9f7cda48ed..732e52dc991 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -132,8 +132,7 @@ static void do_stolen_accounting(void) *snap = state; /* Add the appropriate number of ticks of stolen time, - including any left-overs from last time. Passing NULL to - account_steal_time accounts the time as stolen. */ + including any left-overs from last time. */ stolen = runnable + offline + __get_cpu_var(residual_stolen); if (stolen < 0) @@ -141,11 +140,10 @@ static void do_stolen_accounting(void) ticks = iter_div_u64_rem(stolen, NS_PER_TICK, &stolen); __get_cpu_var(residual_stolen) = stolen; - account_steal_time(NULL, ticks); + account_steal_ticks(ticks); /* Add the appropriate number of ticks of blocked time, - including any left-overs from last time. Passing idle to - account_steal_time accounts the time as idle/wait. */ + including any left-overs from last time. */ blocked += __get_cpu_var(residual_blocked); if (blocked < 0) @@ -153,7 +151,7 @@ static void do_stolen_accounting(void) ticks = iter_div_u64_rem(blocked, NS_PER_TICK, &blocked); __get_cpu_var(residual_blocked) = blocked; - account_steal_time(idle_task(smp_processor_id()), ticks); + account_idle_ticks(ticks); } /* diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h index c78a459662a..570d2041311 100644 --- a/include/linux/kernel_stat.h +++ b/include/linux/kernel_stat.h @@ -81,6 +81,11 @@ static inline unsigned int kstat_irqs(unsigned int irq) extern unsigned long long task_delta_exec(struct task_struct *); extern void account_user_time(struct task_struct *, cputime_t, cputime_t); extern void account_system_time(struct task_struct *, int, cputime_t, cputime_t); -extern void account_steal_time(struct task_struct *, cputime_t); +extern void account_steal_time(cputime_t); +extern void account_idle_time(cputime_t); + +extern void account_process_tick(struct task_struct *, int user); +extern void account_steal_ticks(unsigned long ticks); +extern void account_idle_ticks(unsigned long ticks); #endif /* _LINUX_KERNEL_STAT_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 8395e715809..b475d4db805 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -284,7 +284,6 @@ long io_schedule_timeout(long timeout); extern void cpu_init (void); extern void trap_init(void); -extern void account_process_tick(struct task_struct *task, int user); extern void update_process_times(int user); extern void scheduler_tick(void); diff --git a/kernel/sched.c b/kernel/sched.c index 5b03679ff71..635eaffe1e4 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4139,7 +4139,6 @@ void account_system_time(struct task_struct *p, int hardirq_offset, cputime_t cputime, cputime_t cputime_scaled) { struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; - struct rq *rq = this_rq(); cputime64_t tmp; if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) { @@ -4158,37 +4157,84 @@ void account_system_time(struct task_struct *p, int hardirq_offset, cpustat->irq = cputime64_add(cpustat->irq, tmp); else if (softirq_count()) cpustat->softirq = cputime64_add(cpustat->softirq, tmp); - else if (p != rq->idle) - cpustat->system = cputime64_add(cpustat->system, tmp); - else if (atomic_read(&rq->nr_iowait) > 0) - cpustat->iowait = cputime64_add(cpustat->iowait, tmp); else - cpustat->idle = cputime64_add(cpustat->idle, tmp); + cpustat->system = cputime64_add(cpustat->system, tmp); + /* Account for system time used */ acct_update_integrals(p); } /* * Account for involuntary wait time. - * @p: the process from which the cpu time has been stolen * @steal: the cpu time spent in involuntary wait */ -void account_steal_time(struct task_struct *p, cputime_t steal) +void account_steal_time(cputime_t cputime) { struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; - cputime64_t tmp = cputime_to_cputime64(steal); + cputime64_t cputime64 = cputime_to_cputime64(cputime); + + cpustat->steal = cputime64_add(cpustat->steal, cputime64); +} + +/* + * Account for idle time. + * @cputime: the cpu time spent in idle wait + */ +void account_idle_time(cputime_t cputime) +{ + struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; + cputime64_t cputime64 = cputime_to_cputime64(cputime); struct rq *rq = this_rq(); - if (p == rq->idle) { - p->stime = cputime_add(p->stime, steal); - if (atomic_read(&rq->nr_iowait) > 0) - cpustat->iowait = cputime64_add(cpustat->iowait, tmp); - else - cpustat->idle = cputime64_add(cpustat->idle, tmp); - } else - cpustat->steal = cputime64_add(cpustat->steal, tmp); + if (atomic_read(&rq->nr_iowait) > 0) + cpustat->iowait = cputime64_add(cpustat->iowait, cputime64); + else + cpustat->idle = cputime64_add(cpustat->idle, cputime64); } +#ifndef CONFIG_VIRT_CPU_ACCOUNTING + +/* + * Account a single tick of cpu time. + * @p: the process that the cpu time gets accounted to + * @user_tick: indicates if the tick is a user or a system tick + */ +void account_process_tick(struct task_struct *p, int user_tick) +{ + cputime_t one_jiffy = jiffies_to_cputime(1); + cputime_t one_jiffy_scaled = cputime_to_scaled(one_jiffy); + struct rq *rq = this_rq(); + + if (user_tick) + account_user_time(p, one_jiffy, one_jiffy_scaled); + else if (p != rq->idle) + account_system_time(p, HARDIRQ_OFFSET, one_jiffy, + one_jiffy_scaled); + else + account_idle_time(one_jiffy); +} + +/* + * Account multiple ticks of steal time. + * @p: the process from which the cpu time has been stolen + * @ticks: number of stolen ticks + */ +void account_steal_ticks(unsigned long ticks) +{ + account_steal_time(jiffies_to_cputime(ticks)); +} + +/* + * Account multiple ticks of idle time. + * @ticks: number of stolen ticks + */ +void account_idle_ticks(unsigned long ticks) +{ + account_idle_time(jiffies_to_cputime(ticks)); +} + +#endif + /* * Use precise platform statistics if available: */ diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 1f2fce2479f..611fa4c0baa 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -419,8 +419,9 @@ void tick_nohz_restart_sched_tick(void) { int cpu = smp_processor_id(); struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); +#ifndef CONFIG_VIRT_CPU_ACCOUNTING unsigned long ticks; - cputime_t cputime; +#endif ktime_t now; local_irq_disable(); @@ -442,6 +443,7 @@ void tick_nohz_restart_sched_tick(void) tick_do_update_jiffies64(now); cpu_clear(cpu, nohz_cpu_mask); +#ifndef CONFIG_VIRT_CPU_ACCOUNTING /* * We stopped the tick in idle. Update process times would miss the * time we slept as update_process_times does only a 1 tick @@ -451,12 +453,9 @@ void tick_nohz_restart_sched_tick(void) /* * We might be one off. Do not randomly account a huge number of ticks! */ - if (ticks && ticks < LONG_MAX) { - add_preempt_count(HARDIRQ_OFFSET); - cputime = jiffies_to_cputime(ticks); - account_system_time(current, HARDIRQ_OFFSET, cputime, cputime); - sub_preempt_count(HARDIRQ_OFFSET); - } + if (ticks && ticks < LONG_MAX) + account_idle_ticks(ticks); +#endif touch_softlockup_watchdog(); /* diff --git a/kernel/timer.c b/kernel/timer.c index b5efb528aa1..dee3f641a7a 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -1018,19 +1018,6 @@ unsigned long get_next_timer_interrupt(unsigned long now) } #endif -#ifndef CONFIG_VIRT_CPU_ACCOUNTING -void account_process_tick(struct task_struct *p, int user_tick) -{ - cputime_t one_jiffy = jiffies_to_cputime(1); - - if (user_tick) - account_user_time(p, one_jiffy, cputime_to_scaled(one_jiffy)); - else - account_system_time(p, HARDIRQ_OFFSET, one_jiffy, - cputime_to_scaled(one_jiffy)); -} -#endif - /* * Called from the timer interrupt handler to charge one tick to the current * process. user_tick is 1 if the tick is user time, 0 for system. From aa5e97ce4bbc9d5daeec16b1d15bb3f6b7b4f4d4 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Wed, 31 Dec 2008 15:11:39 +0100 Subject: [PATCH 3/6] [PATCH] improve precision of process accounting. The unit of the cputime accouting values that are stored per process is currently a microsecond. The CPU timer has a maximum granularity of 2**-12 microseconds. There is no benefit in storing the per process values in the lesser precision and there is the disadvantage that the backend has to do the rounding to microseconds. The better solution is to use the maximum granularity of the CPU timer as cputime unit. Signed-off-by: Martin Schwidefsky --- arch/s390/include/asm/cputime.h | 42 ++++++------- arch/s390/include/asm/lowcore.h | 40 ++++++------- arch/s390/include/asm/system.h | 4 +- arch/s390/include/asm/thread_info.h | 2 + arch/s390/kernel/vtime.c | 91 +++++++++++++---------------- 5 files changed, 84 insertions(+), 95 deletions(-) diff --git a/arch/s390/include/asm/cputime.h b/arch/s390/include/asm/cputime.h index 133ce054fc8..521726430af 100644 --- a/arch/s390/include/asm/cputime.h +++ b/arch/s390/include/asm/cputime.h @@ -11,7 +11,7 @@ #include -/* We want to use micro-second resolution. */ +/* We want to use full resolution of the CPU timer: 2**-12 micro-seconds. */ typedef unsigned long long cputime_t; typedef unsigned long long cputime64_t; @@ -53,9 +53,9 @@ __div(unsigned long long n, unsigned int base) #define cputime_ge(__a, __b) ((__a) >= (__b)) #define cputime_lt(__a, __b) ((__a) < (__b)) #define cputime_le(__a, __b) ((__a) <= (__b)) -#define cputime_to_jiffies(__ct) (__div((__ct), 1000000 / HZ)) +#define cputime_to_jiffies(__ct) (__div((__ct), 4096000000ULL / HZ)) #define cputime_to_scaled(__ct) (__ct) -#define jiffies_to_cputime(__hz) ((cputime_t)(__hz) * (1000000 / HZ)) +#define jiffies_to_cputime(__hz) ((cputime_t)(__hz) * (4096000000ULL / HZ)) #define cputime64_zero (0ULL) #define cputime64_add(__a, __b) ((__a) + (__b)) @@ -64,7 +64,7 @@ __div(unsigned long long n, unsigned int base) static inline u64 cputime64_to_jiffies64(cputime64_t cputime) { - do_div(cputime, 1000000 / HZ); + do_div(cputime, 4096000000ULL / HZ); return cputime; } @@ -74,13 +74,13 @@ cputime64_to_jiffies64(cputime64_t cputime) static inline unsigned int cputime_to_msecs(const cputime_t cputime) { - return __div(cputime, 1000); + return __div(cputime, 4096000); } static inline cputime_t msecs_to_cputime(const unsigned int m) { - return (cputime_t) m * 1000; + return (cputime_t) m * 4096000; } /* @@ -89,13 +89,13 @@ msecs_to_cputime(const unsigned int m) static inline unsigned int cputime_to_secs(const cputime_t cputime) { - return __div(cputime, 1000000); + return __div(cputime, 2048000000) >> 1; } static inline cputime_t secs_to_cputime(const unsigned int s) { - return (cputime_t) s * 1000000; + return (cputime_t) s * 4096000000ULL; } /* @@ -104,7 +104,7 @@ secs_to_cputime(const unsigned int s) static inline cputime_t timespec_to_cputime(const struct timespec *value) { - return value->tv_nsec / 1000 + (u64) value->tv_sec * 1000000; + return value->tv_nsec * 4096 / 1000 + (u64) value->tv_sec * 4096000000ULL; } static inline void @@ -114,12 +114,12 @@ cputime_to_timespec(const cputime_t cputime, struct timespec *value) register_pair rp; rp.pair = cputime >> 1; - asm ("dr %0,%1" : "+d" (rp) : "d" (1000000 >> 1)); - value->tv_nsec = rp.subreg.even * 1000; + asm ("dr %0,%1" : "+d" (rp) : "d" (2048000000UL)); + value->tv_nsec = rp.subreg.even * 1000 / 4096; value->tv_sec = rp.subreg.odd; #else - value->tv_nsec = (cputime % 1000000) * 1000; - value->tv_sec = cputime / 1000000; + value->tv_nsec = (cputime % 4096000000ULL) * 1000 / 4096; + value->tv_sec = cputime / 4096000000ULL; #endif } @@ -131,7 +131,7 @@ cputime_to_timespec(const cputime_t cputime, struct timespec *value) static inline cputime_t timeval_to_cputime(const struct timeval *value) { - return value->tv_usec + (u64) value->tv_sec * 1000000; + return value->tv_usec * 4096 + (u64) value->tv_sec * 4096000000ULL; } static inline void @@ -141,12 +141,12 @@ cputime_to_timeval(const cputime_t cputime, struct timeval *value) register_pair rp; rp.pair = cputime >> 1; - asm ("dr %0,%1" : "+d" (rp) : "d" (1000000 >> 1)); - value->tv_usec = rp.subreg.even; + asm ("dr %0,%1" : "+d" (rp) : "d" (2048000000UL)); + value->tv_usec = rp.subreg.even / 4096; value->tv_sec = rp.subreg.odd; #else - value->tv_usec = cputime % 1000000; - value->tv_sec = cputime / 1000000; + value->tv_usec = cputime % 4096000000ULL; + value->tv_sec = cputime / 4096000000ULL; #endif } @@ -156,13 +156,13 @@ cputime_to_timeval(const cputime_t cputime, struct timeval *value) static inline clock_t cputime_to_clock_t(cputime_t cputime) { - return __div(cputime, 1000000 / USER_HZ); + return __div(cputime, 4096000000ULL / USER_HZ); } static inline cputime_t clock_t_to_cputime(unsigned long x) { - return (cputime_t) x * (1000000 / USER_HZ); + return (cputime_t) x * (4096000000ULL / USER_HZ); } /* @@ -171,7 +171,7 @@ clock_t_to_cputime(unsigned long x) static inline clock_t cputime64_to_clock_t(cputime64_t cputime) { - return __div(cputime, 1000000 / USER_HZ); + return __div(cputime, 4096000000ULL / USER_HZ); } #endif /* _S390_CPUTIME_H */ diff --git a/arch/s390/include/asm/lowcore.h b/arch/s390/include/asm/lowcore.h index 0bc51d52a89..a547817cf1a 100644 --- a/arch/s390/include/asm/lowcore.h +++ b/arch/s390/include/asm/lowcore.h @@ -67,11 +67,11 @@ #define __LC_SYNC_ENTER_TIMER 0x248 #define __LC_ASYNC_ENTER_TIMER 0x250 #define __LC_EXIT_TIMER 0x258 -#define __LC_LAST_UPDATE_TIMER 0x260 -#define __LC_USER_TIMER 0x268 -#define __LC_SYSTEM_TIMER 0x270 -#define __LC_LAST_UPDATE_CLOCK 0x278 -#define __LC_STEAL_CLOCK 0x280 +#define __LC_USER_TIMER 0x260 +#define __LC_SYSTEM_TIMER 0x268 +#define __LC_STEAL_TIMER 0x270 +#define __LC_LAST_UPDATE_TIMER 0x278 +#define __LC_LAST_UPDATE_CLOCK 0x280 #define __LC_RETURN_MCCK_PSW 0x288 #define __LC_KERNEL_STACK 0xC40 #define __LC_THREAD_INFO 0xC44 @@ -89,11 +89,11 @@ #define __LC_SYNC_ENTER_TIMER 0x250 #define __LC_ASYNC_ENTER_TIMER 0x258 #define __LC_EXIT_TIMER 0x260 -#define __LC_LAST_UPDATE_TIMER 0x268 -#define __LC_USER_TIMER 0x270 -#define __LC_SYSTEM_TIMER 0x278 -#define __LC_LAST_UPDATE_CLOCK 0x280 -#define __LC_STEAL_CLOCK 0x288 +#define __LC_USER_TIMER 0x268 +#define __LC_SYSTEM_TIMER 0x270 +#define __LC_STEAL_TIMER 0x278 +#define __LC_LAST_UPDATE_TIMER 0x280 +#define __LC_LAST_UPDATE_CLOCK 0x288 #define __LC_RETURN_MCCK_PSW 0x290 #define __LC_KERNEL_STACK 0xD40 #define __LC_THREAD_INFO 0xD48 @@ -252,11 +252,11 @@ struct _lowcore __u64 sync_enter_timer; /* 0x248 */ __u64 async_enter_timer; /* 0x250 */ __u64 exit_timer; /* 0x258 */ - __u64 last_update_timer; /* 0x260 */ - __u64 user_timer; /* 0x268 */ - __u64 system_timer; /* 0x270 */ - __u64 last_update_clock; /* 0x278 */ - __u64 steal_clock; /* 0x280 */ + __u64 user_timer; /* 0x260 */ + __u64 system_timer; /* 0x268 */ + __u64 steal_timer; /* 0x270 */ + __u64 last_update_timer; /* 0x278 */ + __u64 last_update_clock; /* 0x280 */ psw_t return_mcck_psw; /* 0x288 */ __u8 pad8[0xc00-0x290]; /* 0x290 */ @@ -343,11 +343,11 @@ struct _lowcore __u64 sync_enter_timer; /* 0x250 */ __u64 async_enter_timer; /* 0x258 */ __u64 exit_timer; /* 0x260 */ - __u64 last_update_timer; /* 0x268 */ - __u64 user_timer; /* 0x270 */ - __u64 system_timer; /* 0x278 */ - __u64 last_update_clock; /* 0x280 */ - __u64 steal_clock; /* 0x288 */ + __u64 user_timer; /* 0x268 */ + __u64 system_timer; /* 0x270 */ + __u64 steal_timer; /* 0x278 */ + __u64 last_update_timer; /* 0x280 */ + __u64 last_update_clock; /* 0x288 */ psw_t return_mcck_psw; /* 0x290 */ __u8 pad8[0xc00-0x2a0]; /* 0x2a0 */ /* System info area */ diff --git a/arch/s390/include/asm/system.h b/arch/s390/include/asm/system.h index 024ef42ed6d..3a8b26eb1f2 100644 --- a/arch/s390/include/asm/system.h +++ b/arch/s390/include/asm/system.h @@ -99,7 +99,7 @@ static inline void restore_access_regs(unsigned int *acrs) prev = __switch_to(prev,next); \ } while (0) -extern void account_vtime(struct task_struct *); +extern void account_vtime(struct task_struct *, struct task_struct *); extern void account_tick_vtime(struct task_struct *); extern void account_system_vtime(struct task_struct *); @@ -121,7 +121,7 @@ static inline void cmma_init(void) { } #define finish_arch_switch(prev) do { \ set_fs(current->thread.mm_segment); \ - account_vtime(prev); \ + account_vtime(prev, current); \ } while (0) #define nop() asm volatile("nop") diff --git a/arch/s390/include/asm/thread_info.h b/arch/s390/include/asm/thread_info.h index c1eaf9604da..c544aa52453 100644 --- a/arch/s390/include/asm/thread_info.h +++ b/arch/s390/include/asm/thread_info.h @@ -47,6 +47,8 @@ struct thread_info { unsigned int cpu; /* current CPU */ int preempt_count; /* 0 => preemptable, <0 => BUG */ struct restart_block restart_block; + __u64 user_timer; + __u64 system_timer; }; /* diff --git a/arch/s390/kernel/vtime.c b/arch/s390/kernel/vtime.c index 4a4a34caec5..1254a4d0d76 100644 --- a/arch/s390/kernel/vtime.c +++ b/arch/s390/kernel/vtime.c @@ -31,11 +31,10 @@ static DEFINE_PER_CPU(struct vtimer_queue, virt_cpu_timer); * Update process times based on virtual cpu times stored by entry.S * to the lowcore fields user_timer, system_timer & steal_clock. */ -void account_process_tick(struct task_struct *tsk, int user_tick) +static void do_account_vtime(struct task_struct *tsk, int hardirq_offset) { - cputime_t cputime; - __u64 timer, clock; - int rcu_user_flag; + struct thread_info *ti = task_thread_info(tsk); + __u64 timer, clock, user, system, steal; timer = S390_lowcore.last_update_timer; clock = S390_lowcore.last_update_clock; @@ -44,59 +43,47 @@ void account_process_tick(struct task_struct *tsk, int user_tick) : "=m" (S390_lowcore.last_update_timer), "=m" (S390_lowcore.last_update_clock) ); S390_lowcore.system_timer += timer - S390_lowcore.last_update_timer; - S390_lowcore.steal_clock += S390_lowcore.last_update_clock - clock; + S390_lowcore.steal_timer += S390_lowcore.last_update_clock - clock; - cputime = S390_lowcore.user_timer >> 12; - rcu_user_flag = cputime != 0; - S390_lowcore.user_timer -= cputime << 12; - S390_lowcore.steal_clock -= cputime << 12; - account_user_time(tsk, cputime, cputime); + user = S390_lowcore.user_timer - ti->user_timer; + S390_lowcore.steal_timer -= user; + ti->user_timer = S390_lowcore.user_timer; + account_user_time(tsk, user, user); - cputime = S390_lowcore.system_timer >> 12; - S390_lowcore.system_timer -= cputime << 12; - S390_lowcore.steal_clock -= cputime << 12; + system = S390_lowcore.system_timer - ti->system_timer; + S390_lowcore.steal_timer -= system; + ti->system_timer = S390_lowcore.system_timer; if (idle_task(smp_processor_id()) != current) - account_system_time(tsk, HARDIRQ_OFFSET, cputime, cputime); + account_system_time(tsk, hardirq_offset, system, system); else - account_idle_time(cputime); + account_idle_time(system); - cputime = S390_lowcore.steal_clock; - if ((__s64) cputime > 0) { - cputime >>= 12; - S390_lowcore.steal_clock -= cputime << 12; + steal = S390_lowcore.steal_timer; + if ((s64) steal > 0) { + S390_lowcore.steal_timer = 0; if (idle_task(smp_processor_id()) != current) - account_steal_time(cputime); + account_steal_time(steal); else - account_idle_time(cputime); + account_idle_time(steal); } } -/* - * Update process times based on virtual cpu times stored by entry.S - * to the lowcore fields user_timer, system_timer & steal_clock. - */ -void account_vtime(struct task_struct *tsk) +void account_vtime(struct task_struct *prev, struct task_struct *next) { - cputime_t cputime; - __u64 timer; + struct thread_info *ti; - timer = S390_lowcore.last_update_timer; - asm volatile (" STPT %0" /* Store current cpu timer value */ - : "=m" (S390_lowcore.last_update_timer) ); - S390_lowcore.system_timer += timer - S390_lowcore.last_update_timer; + do_account_vtime(prev, 0); + ti = task_thread_info(prev); + ti->user_timer = S390_lowcore.user_timer; + ti->system_timer = S390_lowcore.system_timer; + ti = task_thread_info(next); + S390_lowcore.user_timer = ti->user_timer; + S390_lowcore.system_timer = ti->system_timer; +} - cputime = S390_lowcore.user_timer >> 12; - S390_lowcore.user_timer -= cputime << 12; - S390_lowcore.steal_clock -= cputime << 12; - account_user_time(tsk, cputime, cputime); - - cputime = S390_lowcore.system_timer >> 12; - S390_lowcore.system_timer -= cputime << 12; - S390_lowcore.steal_clock -= cputime << 12; - if (idle_task(smp_processor_id()) != current) - account_system_time(tsk, 0, cputime, cputime); - else - account_idle_time(cputime); +void account_process_tick(struct task_struct *tsk, int user_tick) +{ + do_account_vtime(tsk, HARDIRQ_OFFSET); } /* @@ -105,21 +92,21 @@ void account_vtime(struct task_struct *tsk) */ void account_system_vtime(struct task_struct *tsk) { - cputime_t cputime; - __u64 timer; + struct thread_info *ti = task_thread_info(tsk); + __u64 timer, system; timer = S390_lowcore.last_update_timer; asm volatile (" STPT %0" /* Store current cpu timer value */ : "=m" (S390_lowcore.last_update_timer) ); S390_lowcore.system_timer += timer - S390_lowcore.last_update_timer; - cputime = S390_lowcore.system_timer >> 12; - S390_lowcore.system_timer -= cputime << 12; - S390_lowcore.steal_clock -= cputime << 12; + system = S390_lowcore.system_timer - ti->system_timer; + S390_lowcore.steal_timer -= system; + ti->system_timer = S390_lowcore.system_timer; if (in_irq() || idle_task(smp_processor_id()) != current) - account_system_time(tsk, 0, cputime, cputime); + account_system_time(tsk, 0, system, system); else - account_idle_time(cputime); + account_idle_time(system); } EXPORT_SYMBOL_GPL(account_system_vtime); @@ -490,8 +477,8 @@ void init_cpu_vtimer(void) /* kick the virtual timer */ S390_lowcore.exit_timer = VTIMER_MAX_SLICE; S390_lowcore.last_update_timer = VTIMER_MAX_SLICE; - asm volatile ("SPT %0" : : "m" (S390_lowcore.last_update_timer)); asm volatile ("STCK %0" : "=m" (S390_lowcore.last_update_clock)); + asm volatile ("SPT %0" : : "m" (S390_lowcore.last_update_timer)); /* enable cpu timer interrupts */ __ctl_set_bit(0,10); From 6f43092441bda528dd38f2dc6c1e2522c5079fb7 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Wed, 31 Dec 2008 15:11:40 +0100 Subject: [PATCH 4/6] [PATCH] improve precision of idle time detection. Increase the precision of the idle time calculation that is exported to user space via /sys/devices/system/cpu/cpu/idle_time_us Signed-off-by: Martin Schwidefsky --- arch/s390/include/asm/cpu.h | 3 +- arch/s390/kernel/process.c | 69 ++++++++++++++++++++++++------------- arch/s390/kernel/smp.c | 25 +++++++------- arch/s390/kernel/vtime.c | 3 +- 4 files changed, 62 insertions(+), 38 deletions(-) diff --git a/arch/s390/include/asm/cpu.h b/arch/s390/include/asm/cpu.h index e5a6a9ba3ad..89456df43c4 100644 --- a/arch/s390/include/asm/cpu.h +++ b/arch/s390/include/asm/cpu.h @@ -14,7 +14,6 @@ struct s390_idle_data { spinlock_t lock; - unsigned int in_idle; unsigned long long idle_count; unsigned long long idle_enter; unsigned long long idle_time; @@ -26,7 +25,7 @@ void s390_idle_leave(void); static inline void s390_idle_check(void) { - if ((&__get_cpu_var(s390_idle))->in_idle) + if ((&__get_cpu_var(s390_idle))->idle_enter != 0ULL) s390_idle_leave(); } diff --git a/arch/s390/kernel/process.c b/arch/s390/kernel/process.c index 04f8c67a610..1e06436f07c 100644 --- a/arch/s390/kernel/process.c +++ b/arch/s390/kernel/process.c @@ -38,6 +38,7 @@ #include #include #include +#include #include #include #include @@ -79,30 +80,19 @@ DEFINE_PER_CPU(struct s390_idle_data, s390_idle) = { .lock = __SPIN_LOCK_UNLOCKED(s390_idle.lock) }; -static int s390_idle_enter(void) -{ - struct s390_idle_data *idle; - - idle = &__get_cpu_var(s390_idle); - spin_lock(&idle->lock); - idle->idle_count++; - idle->in_idle = 1; - idle->idle_enter = get_clock(); - spin_unlock(&idle->lock); - vtime_stop_cpu_timer(); - return NOTIFY_OK; -} - void s390_idle_leave(void) { struct s390_idle_data *idle; + unsigned long long idle_time; - vtime_start_cpu_timer(); idle = &__get_cpu_var(s390_idle); + idle_time = S390_lowcore.int_clock - idle->idle_enter; spin_lock(&idle->lock); - idle->idle_time += get_clock() - idle->idle_enter; - idle->in_idle = 0; + idle->idle_time += idle_time; + idle->idle_enter = 0ULL; + idle->idle_count++; spin_unlock(&idle->lock); + vtime_start_cpu_timer(); } extern void s390_handle_mcck(void); @@ -111,16 +101,16 @@ extern void s390_handle_mcck(void); */ static void default_idle(void) { + struct s390_idle_data *idle = &__get_cpu_var(s390_idle); + unsigned long addr; + psw_t psw; + /* CPU is going idle. */ local_irq_disable(); if (need_resched()) { local_irq_enable(); return; } - if (s390_idle_enter() == NOTIFY_BAD) { - local_irq_enable(); - return; - } #ifdef CONFIG_HOTPLUG_CPU if (cpu_is_offline(smp_processor_id())) { preempt_enable_no_resched(); @@ -138,9 +128,42 @@ static void default_idle(void) trace_hardirqs_on(); /* Don't trace preempt off for idle. */ stop_critical_timings(); + vtime_stop_cpu_timer(); + + /* + * The inline assembly is equivalent to + * idle->idle_enter = get_clock(); + * __load_psw_mask(psw_kernel_bits | PSW_MASK_WAIT | + * PSW_MASK_IO | PSW_MASK_EXT); + * The difference is that the inline assembly makes sure that + * the stck instruction is right before the lpsw instruction. + * This is done to increase the precision. + */ + /* Wait for external, I/O or machine check interrupt. */ - __load_psw_mask(psw_kernel_bits | PSW_MASK_WAIT | - PSW_MASK_IO | PSW_MASK_EXT); + psw.mask = psw_kernel_bits|PSW_MASK_WAIT|PSW_MASK_IO|PSW_MASK_EXT; +#ifndef __s390x__ + asm volatile( + " basr %0,0\n" + "0: ahi %0,1f-0b\n" + " st %0,4(%2)\n" + " stck 0(%3)\n" + " lpsw 0(%2)\n" + "1:" + : "=&d" (addr), "=m" (idle->idle_enter) + : "a" (&psw), "a" (&idle->idle_enter), "m" (psw) + : "memory", "cc"); +#else /* __s390x__ */ + asm volatile( + " larl %0,1f\n" + " stg %0,8(%2)\n" + " stck 0(%3)\n" + " lpswe 0(%2)\n" + "1:" + : "=&d" (addr), "=m" (idle->idle_enter) + : "a" (&psw), "a" (&idle->idle_enter), "m" (psw) + : "memory", "cc"); +#endif /* __s390x__ */ start_critical_timings(); } diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c index 6fc78541dc5..3979a6fc088 100644 --- a/arch/s390/kernel/smp.c +++ b/arch/s390/kernel/smp.c @@ -851,9 +851,11 @@ static ssize_t show_idle_count(struct sys_device *dev, unsigned long long idle_count; idle = &per_cpu(s390_idle, dev->id); - spin_lock_irq(&idle->lock); + spin_lock(&idle->lock); idle_count = idle->idle_count; - spin_unlock_irq(&idle->lock); + if (idle->idle_enter) + idle_count++; + spin_unlock(&idle->lock); return sprintf(buf, "%llu\n", idle_count); } static SYSDEV_ATTR(idle_count, 0444, show_idle_count, NULL); @@ -862,18 +864,17 @@ static ssize_t show_idle_time(struct sys_device *dev, struct sysdev_attribute *attr, char *buf) { struct s390_idle_data *idle; - unsigned long long new_time; + unsigned long long now, idle_time, idle_enter; idle = &per_cpu(s390_idle, dev->id); - spin_lock_irq(&idle->lock); - if (idle->in_idle) { - new_time = get_clock(); - idle->idle_time += new_time - idle->idle_enter; - idle->idle_enter = new_time; - } - new_time = idle->idle_time; - spin_unlock_irq(&idle->lock); - return sprintf(buf, "%llu\n", new_time >> 12); + spin_lock(&idle->lock); + now = get_clock(); + idle_time = idle->idle_time; + idle_enter = idle->idle_enter; + if (idle_enter != 0ULL && idle_enter < now) + idle_time += now - idle_enter; + spin_unlock(&idle->lock); + return sprintf(buf, "%llu\n", idle_time >> 12); } static SYSDEV_ATTR(idle_time_us, 0444, show_idle_time, NULL); diff --git a/arch/s390/kernel/vtime.c b/arch/s390/kernel/vtime.c index 1254a4d0d76..25d21fef76b 100644 --- a/arch/s390/kernel/vtime.c +++ b/arch/s390/kernel/vtime.c @@ -112,6 +112,7 @@ EXPORT_SYMBOL_GPL(account_system_vtime); static inline void set_vtimer(__u64 expires) { + struct vtimer_queue *vq = &__get_cpu_var(virt_cpu_timer); __u64 timer; asm volatile (" STPT %0\n" /* Store current cpu timer value */ @@ -121,7 +122,7 @@ static inline void set_vtimer(__u64 expires) S390_lowcore.last_update_timer = expires; /* store expire time for this CPU timer */ - __get_cpu_var(virt_cpu_timer).to_expire = expires; + vq->to_expire = expires; } void vtime_start_cpu_timer(void) From 9cfb9b3c3a7361c793c031e9c3583b177ac5debd Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Wed, 31 Dec 2008 15:11:41 +0100 Subject: [PATCH 5/6] [PATCH] improve idle cputime accounting Distinguish the cputime of the idle process where idle is actually using cpu cycles from the cputime where idle is sleeping on an enabled wait psw. The former is accounted as system time, the later as idle time. Signed-off-by: Martin Schwidefsky --- arch/s390/include/asm/cpu.h | 4 +- arch/s390/include/asm/timer.h | 16 +- arch/s390/kernel/entry.S | 5 +- arch/s390/kernel/entry64.S | 5 +- arch/s390/kernel/process.c | 64 +----- arch/s390/kernel/s390_ext.c | 2 +- arch/s390/kernel/vtime.c | 404 +++++++++++++++++++--------------- drivers/s390/cio/cio.c | 2 +- drivers/s390/s390mach.c | 3 + 9 files changed, 244 insertions(+), 261 deletions(-) diff --git a/arch/s390/include/asm/cpu.h b/arch/s390/include/asm/cpu.h index 89456df43c4..d60a2eefb17 100644 --- a/arch/s390/include/asm/cpu.h +++ b/arch/s390/include/asm/cpu.h @@ -21,12 +21,12 @@ struct s390_idle_data { DECLARE_PER_CPU(struct s390_idle_data, s390_idle); -void s390_idle_leave(void); +void vtime_start_cpu(void); static inline void s390_idle_check(void) { if ((&__get_cpu_var(s390_idle))->idle_enter != 0ULL) - s390_idle_leave(); + vtime_start_cpu(); } #endif /* _ASM_S390_CPU_H_ */ diff --git a/arch/s390/include/asm/timer.h b/arch/s390/include/asm/timer.h index 61705d60f99..e4bcab739c1 100644 --- a/arch/s390/include/asm/timer.h +++ b/arch/s390/include/asm/timer.h @@ -23,20 +23,18 @@ struct vtimer_list { __u64 expires; __u64 interval; - spinlock_t lock; - unsigned long magic; - void (*function)(unsigned long); unsigned long data; }; -/* the offset value will wrap after ca. 71 years */ +/* the vtimer value will wrap after ca. 71 years */ struct vtimer_queue { struct list_head list; spinlock_t lock; - __u64 to_expire; /* current event expire time */ - __u64 offset; /* list offset to zero */ - __u64 idle; /* temp var for idle */ + __u64 timer; /* last programmed timer */ + __u64 elapsed; /* elapsed time of timer expire values */ + __u64 idle; /* temp var for idle */ + int do_spt; /* =1: reprogram cpu timer in idle */ }; extern void init_virt_timer(struct vtimer_list *timer); @@ -48,8 +46,8 @@ extern int del_virt_timer(struct vtimer_list *timer); extern void init_cpu_vtimer(void); extern void vtime_init(void); -extern void vtime_start_cpu_timer(void); -extern void vtime_stop_cpu_timer(void); +extern void vtime_stop_cpu(void); +extern void vtime_start_leave(void); #endif /* __KERNEL__ */ diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S index 55de521aef7..1268aa2991b 100644 --- a/arch/s390/kernel/entry.S +++ b/arch/s390/kernel/entry.S @@ -583,8 +583,8 @@ kernel_per: .globl io_int_handler io_int_handler: - stpt __LC_ASYNC_ENTER_TIMER stck __LC_INT_CLOCK + stpt __LC_ASYNC_ENTER_TIMER SAVE_ALL_BASE __LC_SAVE_AREA+16 SAVE_ALL_ASYNC __LC_IO_OLD_PSW,__LC_SAVE_AREA+16 CREATE_STACK_FRAME __LC_IO_OLD_PSW,__LC_SAVE_AREA+16 @@ -723,8 +723,8 @@ io_notify_resume: .globl ext_int_handler ext_int_handler: - stpt __LC_ASYNC_ENTER_TIMER stck __LC_INT_CLOCK + stpt __LC_ASYNC_ENTER_TIMER SAVE_ALL_BASE __LC_SAVE_AREA+16 SAVE_ALL_ASYNC __LC_EXT_OLD_PSW,__LC_SAVE_AREA+16 CREATE_STACK_FRAME __LC_EXT_OLD_PSW,__LC_SAVE_AREA+16 @@ -750,6 +750,7 @@ __critical_end: .globl mcck_int_handler mcck_int_handler: + stck __LC_INT_CLOCK spt __LC_CPU_TIMER_SAVE_AREA # revalidate cpu timer lm %r0,%r15,__LC_GPREGS_SAVE_AREA # revalidate gprs SAVE_ALL_BASE __LC_SAVE_AREA+32 diff --git a/arch/s390/kernel/entry64.S b/arch/s390/kernel/entry64.S index 16bb4fd1a40..ae83c195171 100644 --- a/arch/s390/kernel/entry64.S +++ b/arch/s390/kernel/entry64.S @@ -559,8 +559,8 @@ kernel_per: */ .globl io_int_handler io_int_handler: - stpt __LC_ASYNC_ENTER_TIMER stck __LC_INT_CLOCK + stpt __LC_ASYNC_ENTER_TIMER SAVE_ALL_BASE __LC_SAVE_AREA+32 SAVE_ALL_ASYNC __LC_IO_OLD_PSW,__LC_SAVE_AREA+32 CREATE_STACK_FRAME __LC_IO_OLD_PSW,__LC_SAVE_AREA+32 @@ -721,8 +721,8 @@ io_notify_resume: */ .globl ext_int_handler ext_int_handler: - stpt __LC_ASYNC_ENTER_TIMER stck __LC_INT_CLOCK + stpt __LC_ASYNC_ENTER_TIMER SAVE_ALL_BASE __LC_SAVE_AREA+32 SAVE_ALL_ASYNC __LC_EXT_OLD_PSW,__LC_SAVE_AREA+32 CREATE_STACK_FRAME __LC_EXT_OLD_PSW,__LC_SAVE_AREA+32 @@ -746,6 +746,7 @@ __critical_end: */ .globl mcck_int_handler mcck_int_handler: + stck __LC_INT_CLOCK la %r1,4095 # revalidate r1 spt __LC_CPU_TIMER_SAVE_AREA-4095(%r1) # revalidate cpu timer lmg %r0,%r15,__LC_GPREGS_SAVE_AREA-4095(%r1)# revalidate gprs diff --git a/arch/s390/kernel/process.c b/arch/s390/kernel/process.c index 1e06436f07c..b6110bdf8dc 100644 --- a/arch/s390/kernel/process.c +++ b/arch/s390/kernel/process.c @@ -46,7 +46,6 @@ #include #include #include -#include #include "entry.h" asmlinkage void ret_from_fork(void) asm ("ret_from_fork"); @@ -76,35 +75,12 @@ unsigned long thread_saved_pc(struct task_struct *tsk) return sf->gprs[8]; } -DEFINE_PER_CPU(struct s390_idle_data, s390_idle) = { - .lock = __SPIN_LOCK_UNLOCKED(s390_idle.lock) -}; - -void s390_idle_leave(void) -{ - struct s390_idle_data *idle; - unsigned long long idle_time; - - idle = &__get_cpu_var(s390_idle); - idle_time = S390_lowcore.int_clock - idle->idle_enter; - spin_lock(&idle->lock); - idle->idle_time += idle_time; - idle->idle_enter = 0ULL; - idle->idle_count++; - spin_unlock(&idle->lock); - vtime_start_cpu_timer(); -} - extern void s390_handle_mcck(void); /* * The idle loop on a S390... */ static void default_idle(void) { - struct s390_idle_data *idle = &__get_cpu_var(s390_idle); - unsigned long addr; - psw_t psw; - /* CPU is going idle. */ local_irq_disable(); if (need_resched()) { @@ -120,7 +96,6 @@ static void default_idle(void) local_mcck_disable(); if (test_thread_flag(TIF_MCCK_PENDING)) { local_mcck_enable(); - s390_idle_leave(); local_irq_enable(); s390_handle_mcck(); return; @@ -128,42 +103,9 @@ static void default_idle(void) trace_hardirqs_on(); /* Don't trace preempt off for idle. */ stop_critical_timings(); - vtime_stop_cpu_timer(); - - /* - * The inline assembly is equivalent to - * idle->idle_enter = get_clock(); - * __load_psw_mask(psw_kernel_bits | PSW_MASK_WAIT | - * PSW_MASK_IO | PSW_MASK_EXT); - * The difference is that the inline assembly makes sure that - * the stck instruction is right before the lpsw instruction. - * This is done to increase the precision. - */ - - /* Wait for external, I/O or machine check interrupt. */ - psw.mask = psw_kernel_bits|PSW_MASK_WAIT|PSW_MASK_IO|PSW_MASK_EXT; -#ifndef __s390x__ - asm volatile( - " basr %0,0\n" - "0: ahi %0,1f-0b\n" - " st %0,4(%2)\n" - " stck 0(%3)\n" - " lpsw 0(%2)\n" - "1:" - : "=&d" (addr), "=m" (idle->idle_enter) - : "a" (&psw), "a" (&idle->idle_enter), "m" (psw) - : "memory", "cc"); -#else /* __s390x__ */ - asm volatile( - " larl %0,1f\n" - " stg %0,8(%2)\n" - " stck 0(%3)\n" - " lpswe 0(%2)\n" - "1:" - : "=&d" (addr), "=m" (idle->idle_enter) - : "a" (&psw), "a" (&idle->idle_enter), "m" (psw) - : "memory", "cc"); -#endif /* __s390x__ */ + /* Stop virtual timer and halt the cpu. */ + vtime_stop_cpu(); + /* Reenable preemption tracer. */ start_critical_timings(); } diff --git a/arch/s390/kernel/s390_ext.c b/arch/s390/kernel/s390_ext.c index e019b419efc..a0d2d55d7fb 100644 --- a/arch/s390/kernel/s390_ext.c +++ b/arch/s390/kernel/s390_ext.c @@ -119,8 +119,8 @@ void do_extint(struct pt_regs *regs, unsigned short code) struct pt_regs *old_regs; old_regs = set_irq_regs(regs); - irq_enter(); s390_idle_check(); + irq_enter(); if (S390_lowcore.int_clock >= S390_lowcore.clock_comparator) /* Serve timer interrupts first. */ clock_comparator_work(); diff --git a/arch/s390/kernel/vtime.c b/arch/s390/kernel/vtime.c index 25d21fef76b..2fb36e46219 100644 --- a/arch/s390/kernel/vtime.c +++ b/arch/s390/kernel/vtime.c @@ -23,10 +23,35 @@ #include #include #include +#include static ext_int_info_t ext_int_info_timer; + static DEFINE_PER_CPU(struct vtimer_queue, virt_cpu_timer); +DEFINE_PER_CPU(struct s390_idle_data, s390_idle) = { + .lock = __SPIN_LOCK_UNLOCKED(s390_idle.lock) +}; + +static inline __u64 get_vtimer(void) +{ + __u64 timer; + + asm volatile("STPT %0" : "=m" (timer)); + return timer; +} + +static inline void set_vtimer(__u64 expires) +{ + __u64 timer; + + asm volatile (" STPT %0\n" /* Store current cpu timer value */ + " SPT %1" /* Set new value immediatly afterwards */ + : "=m" (timer) : "m" (expires) ); + S390_lowcore.system_timer += S390_lowcore.last_update_timer - timer; + S390_lowcore.last_update_timer = expires; +} + /* * Update process times based on virtual cpu times stored by entry.S * to the lowcore fields user_timer, system_timer & steal_clock. @@ -53,18 +78,12 @@ static void do_account_vtime(struct task_struct *tsk, int hardirq_offset) system = S390_lowcore.system_timer - ti->system_timer; S390_lowcore.steal_timer -= system; ti->system_timer = S390_lowcore.system_timer; - if (idle_task(smp_processor_id()) != current) - account_system_time(tsk, hardirq_offset, system, system); - else - account_idle_time(system); + account_system_time(tsk, hardirq_offset, system, system); steal = S390_lowcore.steal_timer; if ((s64) steal > 0) { S390_lowcore.steal_timer = 0; - if (idle_task(smp_processor_id()) != current) - account_steal_time(steal); - else - account_idle_time(steal); + account_steal_time(steal); } } @@ -96,80 +115,127 @@ void account_system_vtime(struct task_struct *tsk) __u64 timer, system; timer = S390_lowcore.last_update_timer; - asm volatile (" STPT %0" /* Store current cpu timer value */ - : "=m" (S390_lowcore.last_update_timer) ); + S390_lowcore.last_update_timer = get_vtimer(); S390_lowcore.system_timer += timer - S390_lowcore.last_update_timer; system = S390_lowcore.system_timer - ti->system_timer; S390_lowcore.steal_timer -= system; ti->system_timer = S390_lowcore.system_timer; - if (in_irq() || idle_task(smp_processor_id()) != current) - account_system_time(tsk, 0, system, system); - else - account_idle_time(system); + account_system_time(tsk, 0, system, system); } EXPORT_SYMBOL_GPL(account_system_vtime); -static inline void set_vtimer(__u64 expires) +void vtime_start_cpu(void) { + struct s390_idle_data *idle = &__get_cpu_var(s390_idle); struct vtimer_queue *vq = &__get_cpu_var(virt_cpu_timer); - __u64 timer; + __u64 idle_time, expires; - asm volatile (" STPT %0\n" /* Store current cpu timer value */ - " SPT %1" /* Set new value immediatly afterwards */ - : "=m" (timer) : "m" (expires) ); - S390_lowcore.system_timer += S390_lowcore.last_update_timer - timer; - S390_lowcore.last_update_timer = expires; + /* Account time spent with enabled wait psw loaded as idle time. */ + idle_time = S390_lowcore.int_clock - idle->idle_enter; + account_idle_time(idle_time); + S390_lowcore.last_update_clock = S390_lowcore.int_clock; - /* store expire time for this CPU timer */ - vq->to_expire = expires; -} + /* Account system time spent going idle. */ + S390_lowcore.system_timer += S390_lowcore.last_update_timer - vq->idle; + S390_lowcore.last_update_timer = S390_lowcore.async_enter_timer; -void vtime_start_cpu_timer(void) -{ - struct vtimer_queue *vt_list; - - vt_list = &__get_cpu_var(virt_cpu_timer); - - /* CPU timer interrupt is pending, don't reprogramm it */ - if (vt_list->idle & 1LL<<63) - return; - - if (!list_empty(&vt_list->list)) - set_vtimer(vt_list->idle); -} - -void vtime_stop_cpu_timer(void) -{ - struct vtimer_queue *vt_list; - - vt_list = &__get_cpu_var(virt_cpu_timer); - - /* nothing to do */ - if (list_empty(&vt_list->list)) { - vt_list->idle = VTIMER_MAX_SLICE; - goto fire; + /* Restart vtime CPU timer */ + if (vq->do_spt) { + /* Program old expire value but first save progress. */ + expires = vq->idle - S390_lowcore.async_enter_timer; + expires += get_vtimer(); + set_vtimer(expires); + } else { + /* Don't account the CPU timer delta while the cpu was idle. */ + vq->elapsed -= vq->idle - S390_lowcore.async_enter_timer; } - /* store the actual expire value */ - asm volatile ("STPT %0" : "=m" (vt_list->idle)); + spin_lock(&idle->lock); + idle->idle_time += idle_time; + idle->idle_enter = 0ULL; + idle->idle_count++; + spin_unlock(&idle->lock); +} - /* - * If the CPU timer is negative we don't reprogramm - * it because we will get instantly an interrupt. - */ - if (vt_list->idle & 1LL<<63) - return; +void vtime_stop_cpu(void) +{ + struct s390_idle_data *idle = &__get_cpu_var(s390_idle); + struct vtimer_queue *vq = &__get_cpu_var(virt_cpu_timer); + psw_t psw; - vt_list->offset += vt_list->to_expire - vt_list->idle; + /* Wait for external, I/O or machine check interrupt. */ + psw.mask = psw_kernel_bits | PSW_MASK_WAIT | PSW_MASK_IO | PSW_MASK_EXT; - /* - * We cannot halt the CPU timer, we just write a value that - * nearly never expires (only after 71 years) and re-write - * the stored expire value if we continue the timer - */ - fire: - set_vtimer(VTIMER_MAX_SLICE); + /* Check if the CPU timer needs to be reprogrammed. */ + if (vq->do_spt) { + __u64 vmax = VTIMER_MAX_SLICE; + /* + * The inline assembly is equivalent to + * vq->idle = get_cpu_timer(); + * set_cpu_timer(VTIMER_MAX_SLICE); + * idle->idle_enter = get_clock(); + * __load_psw_mask(psw_kernel_bits | PSW_MASK_WAIT | + * PSW_MASK_IO | PSW_MASK_EXT); + * The difference is that the inline assembly makes sure that + * the last three instruction are stpt, stck and lpsw in that + * order. This is done to increase the precision. + */ + asm volatile( +#ifndef CONFIG_64BIT + " basr 1,0\n" + "0: ahi 1,1f-0b\n" + " st 1,4(%2)\n" +#else /* CONFIG_64BIT */ + " larl 1,1f\n" + " stg 1,8(%2)\n" +#endif /* CONFIG_64BIT */ + " stpt 0(%4)\n" + " spt 0(%5)\n" + " stck 0(%3)\n" +#ifndef CONFIG_64BIT + " lpsw 0(%2)\n" +#else /* CONFIG_64BIT */ + " lpswe 0(%2)\n" +#endif /* CONFIG_64BIT */ + "1:" + : "=m" (idle->idle_enter), "=m" (vq->idle) + : "a" (&psw), "a" (&idle->idle_enter), + "a" (&vq->idle), "a" (&vmax), "m" (vmax), "m" (psw) + : "memory", "cc", "1"); + } else { + /* + * The inline assembly is equivalent to + * vq->idle = get_cpu_timer(); + * idle->idle_enter = get_clock(); + * __load_psw_mask(psw_kernel_bits | PSW_MASK_WAIT | + * PSW_MASK_IO | PSW_MASK_EXT); + * The difference is that the inline assembly makes sure that + * the last three instruction are stpt, stck and lpsw in that + * order. This is done to increase the precision. + */ + asm volatile( +#ifndef CONFIG_64BIT + " basr 1,0\n" + "0: ahi 1,1f-0b\n" + " st 1,4(%2)\n" +#else /* CONFIG_64BIT */ + " larl 1,1f\n" + " stg 1,8(%2)\n" +#endif /* CONFIG_64BIT */ + " stpt 0(%4)\n" + " stck 0(%3)\n" +#ifndef CONFIG_64BIT + " lpsw 0(%2)\n" +#else /* CONFIG_64BIT */ + " lpswe 0(%2)\n" +#endif /* CONFIG_64BIT */ + "1:" + : "=m" (idle->idle_enter), "=m" (vq->idle) + : "a" (&psw), "a" (&idle->idle_enter), + "a" (&vq->idle), "m" (psw) + : "memory", "cc", "1"); + } } /* @@ -195,30 +261,23 @@ static void list_add_sorted(struct vtimer_list *timer, struct list_head *head) */ static void do_callbacks(struct list_head *cb_list) { - struct vtimer_queue *vt_list; + struct vtimer_queue *vq; struct vtimer_list *event, *tmp; - void (*fn)(unsigned long); - unsigned long data; if (list_empty(cb_list)) return; - vt_list = &__get_cpu_var(virt_cpu_timer); + vq = &__get_cpu_var(virt_cpu_timer); list_for_each_entry_safe(event, tmp, cb_list, entry) { - fn = event->function; - data = event->data; - fn(data); - - if (!event->interval) - /* delete one shot timer */ - list_del_init(&event->entry); - else { - /* move interval timer back to list */ - spin_lock(&vt_list->lock); - list_del_init(&event->entry); - list_add_sorted(event, &vt_list->list); - spin_unlock(&vt_list->lock); + list_del_init(&event->entry); + (event->function)(event->data); + if (event->interval) { + /* Recharge interval timer */ + event->expires = event->interval + vq->elapsed; + spin_lock(&vq->lock); + list_add_sorted(event, &vq->list); + spin_unlock(&vq->lock); } } } @@ -228,64 +287,57 @@ static void do_callbacks(struct list_head *cb_list) */ static void do_cpu_timer_interrupt(__u16 error_code) { - __u64 next, delta; - struct vtimer_queue *vt_list; + struct vtimer_queue *vq; struct vtimer_list *event, *tmp; - struct list_head *ptr; - /* the callback queue */ - struct list_head cb_list; + struct list_head cb_list; /* the callback queue */ + __u64 elapsed, next; INIT_LIST_HEAD(&cb_list); - vt_list = &__get_cpu_var(virt_cpu_timer); + vq = &__get_cpu_var(virt_cpu_timer); /* walk timer list, fire all expired events */ - spin_lock(&vt_list->lock); + spin_lock(&vq->lock); - if (vt_list->to_expire < VTIMER_MAX_SLICE) - vt_list->offset += vt_list->to_expire; - - list_for_each_entry_safe(event, tmp, &vt_list->list, entry) { - if (event->expires > vt_list->offset) - /* found first unexpired event, leave */ - break; - - /* re-charge interval timer, we have to add the offset */ - if (event->interval) - event->expires = event->interval + vt_list->offset; - - /* move expired timer to the callback queue */ - list_move_tail(&event->entry, &cb_list); + elapsed = vq->elapsed + (vq->timer - S390_lowcore.async_enter_timer); + BUG_ON((s64) elapsed < 0); + vq->elapsed = 0; + list_for_each_entry_safe(event, tmp, &vq->list, entry) { + if (event->expires < elapsed) + /* move expired timer to the callback queue */ + list_move_tail(&event->entry, &cb_list); + else + event->expires -= elapsed; } - spin_unlock(&vt_list->lock); + spin_unlock(&vq->lock); + + vq->do_spt = list_empty(&cb_list); do_callbacks(&cb_list); /* next event is first in list */ - spin_lock(&vt_list->lock); - if (!list_empty(&vt_list->list)) { - ptr = vt_list->list.next; - event = list_entry(ptr, struct vtimer_list, entry); - next = event->expires - vt_list->offset; - - /* add the expired time from this interrupt handler - * and the callback functions - */ - asm volatile ("STPT %0" : "=m" (delta)); - delta = 0xffffffffffffffffLL - delta + 1; - vt_list->offset += delta; - next -= delta; - } else { - vt_list->offset = 0; - next = VTIMER_MAX_SLICE; - } - spin_unlock(&vt_list->lock); - set_vtimer(next); + next = VTIMER_MAX_SLICE; + spin_lock(&vq->lock); + if (!list_empty(&vq->list)) { + event = list_first_entry(&vq->list, struct vtimer_list, entry); + next = event->expires; + } else + vq->do_spt = 0; + spin_unlock(&vq->lock); + /* + * To improve precision add the time spent by the + * interrupt handler to the elapsed time. + * Note: CPU timer counts down and we got an interrupt, + * the current content is negative + */ + elapsed = S390_lowcore.async_enter_timer - get_vtimer(); + set_vtimer(next - elapsed); + vq->timer = next - elapsed; + vq->elapsed = elapsed; } void init_virt_timer(struct vtimer_list *timer) { timer->function = NULL; INIT_LIST_HEAD(&timer->entry); - spin_lock_init(&timer->lock); } EXPORT_SYMBOL(init_virt_timer); @@ -299,44 +351,40 @@ static inline int vtimer_pending(struct vtimer_list *timer) */ static void internal_add_vtimer(struct vtimer_list *timer) { + struct vtimer_queue *vq; unsigned long flags; - __u64 done; - struct vtimer_list *event; - struct vtimer_queue *vt_list; + __u64 left, expires; - vt_list = &per_cpu(virt_cpu_timer, timer->cpu); - spin_lock_irqsave(&vt_list->lock, flags); + vq = &per_cpu(virt_cpu_timer, timer->cpu); + spin_lock_irqsave(&vq->lock, flags); BUG_ON(timer->cpu != smp_processor_id()); - /* if list is empty we only have to set the timer */ - if (list_empty(&vt_list->list)) { - /* reset the offset, this may happen if the last timer was - * just deleted by mod_virt_timer and the interrupt - * didn't happen until here - */ - vt_list->offset = 0; - goto fire; + if (list_empty(&vq->list)) { + /* First timer on this cpu, just program it. */ + list_add(&timer->entry, &vq->list); + set_vtimer(timer->expires); + vq->timer = timer->expires; + vq->elapsed = 0; + } else { + /* Check progress of old timers. */ + expires = timer->expires; + left = get_vtimer(); + if (likely((s64) expires < (s64) left)) { + /* The new timer expires before the current timer. */ + set_vtimer(expires); + vq->elapsed += vq->timer - left; + vq->timer = expires; + } else { + vq->elapsed += vq->timer - left; + vq->timer = left; + } + /* Insert new timer into per cpu list. */ + timer->expires += vq->elapsed; + list_add_sorted(timer, &vq->list); } - /* save progress */ - asm volatile ("STPT %0" : "=m" (done)); - - /* calculate completed work */ - done = vt_list->to_expire - done + vt_list->offset; - vt_list->offset = 0; - - list_for_each_entry(event, &vt_list->list, entry) - event->expires -= done; - - fire: - list_add_sorted(timer, &vt_list->list); - - /* get first element, which is the next vtimer slice */ - event = list_entry(vt_list->list.next, struct vtimer_list, entry); - - set_vtimer(event->expires); - spin_unlock_irqrestore(&vt_list->lock, flags); + spin_unlock_irqrestore(&vq->lock, flags); /* release CPU acquired in prepare_vtimer or mod_virt_timer() */ put_cpu(); } @@ -381,14 +429,15 @@ EXPORT_SYMBOL(add_virt_timer_periodic); * If we change a pending timer the function must be called on the CPU * where the timer is running on, e.g. by smp_call_function_single() * - * The original mod_timer adds the timer if it is not pending. For compatibility - * we do the same. The timer will be added on the current CPU as a oneshot timer. + * The original mod_timer adds the timer if it is not pending. For + * compatibility we do the same. The timer will be added on the current + * CPU as a oneshot timer. * * returns whether it has modified a pending timer (1) or not (0) */ int mod_virt_timer(struct vtimer_list *timer, __u64 expires) { - struct vtimer_queue *vt_list; + struct vtimer_queue *vq; unsigned long flags; int cpu; @@ -404,17 +453,17 @@ int mod_virt_timer(struct vtimer_list *timer, __u64 expires) return 1; cpu = get_cpu(); - vt_list = &per_cpu(virt_cpu_timer, cpu); + vq = &per_cpu(virt_cpu_timer, cpu); /* check if we run on the right CPU */ BUG_ON(timer->cpu != cpu); /* disable interrupts before test if timer is pending */ - spin_lock_irqsave(&vt_list->lock, flags); + spin_lock_irqsave(&vq->lock, flags); /* if timer isn't pending add it on the current CPU */ if (!vtimer_pending(timer)) { - spin_unlock_irqrestore(&vt_list->lock, flags); + spin_unlock_irqrestore(&vq->lock, flags); /* we do not activate an interval timer with mod_virt_timer */ timer->interval = 0; timer->expires = expires; @@ -431,7 +480,7 @@ int mod_virt_timer(struct vtimer_list *timer, __u64 expires) timer->interval = expires; /* the timer can't expire anymore so we can release the lock */ - spin_unlock_irqrestore(&vt_list->lock, flags); + spin_unlock_irqrestore(&vq->lock, flags); internal_add_vtimer(timer); return 1; } @@ -445,25 +494,19 @@ EXPORT_SYMBOL(mod_virt_timer); int del_virt_timer(struct vtimer_list *timer) { unsigned long flags; - struct vtimer_queue *vt_list; + struct vtimer_queue *vq; /* check if timer is pending */ if (!vtimer_pending(timer)) return 0; - vt_list = &per_cpu(virt_cpu_timer, timer->cpu); - spin_lock_irqsave(&vt_list->lock, flags); + vq = &per_cpu(virt_cpu_timer, timer->cpu); + spin_lock_irqsave(&vq->lock, flags); /* we don't interrupt a running timer, just let it expire! */ list_del_init(&timer->entry); - /* last timer removed */ - if (list_empty(&vt_list->list)) { - vt_list->to_expire = 0; - vt_list->offset = 0; - } - - spin_unlock_irqrestore(&vt_list->lock, flags); + spin_unlock_irqrestore(&vq->lock, flags); return 1; } EXPORT_SYMBOL(del_virt_timer); @@ -473,24 +516,19 @@ EXPORT_SYMBOL(del_virt_timer); */ void init_cpu_vtimer(void) { - struct vtimer_queue *vt_list; + struct vtimer_queue *vq; /* kick the virtual timer */ - S390_lowcore.exit_timer = VTIMER_MAX_SLICE; - S390_lowcore.last_update_timer = VTIMER_MAX_SLICE; asm volatile ("STCK %0" : "=m" (S390_lowcore.last_update_clock)); - asm volatile ("SPT %0" : : "m" (S390_lowcore.last_update_timer)); + asm volatile ("STPT %0" : "=m" (S390_lowcore.last_update_timer)); + + /* initialize per cpu vtimer structure */ + vq = &__get_cpu_var(virt_cpu_timer); + INIT_LIST_HEAD(&vq->list); + spin_lock_init(&vq->lock); /* enable cpu timer interrupts */ __ctl_set_bit(0,10); - - vt_list = &__get_cpu_var(virt_cpu_timer); - INIT_LIST_HEAD(&vt_list->list); - spin_lock_init(&vt_list->lock); - vt_list->to_expire = 0; - vt_list->offset = 0; - vt_list->idle = 0; - } void __init vtime_init(void) diff --git a/drivers/s390/cio/cio.c b/drivers/s390/cio/cio.c index 8a8df755296..06b71823f39 100644 --- a/drivers/s390/cio/cio.c +++ b/drivers/s390/cio/cio.c @@ -632,8 +632,8 @@ do_IRQ (struct pt_regs *regs) struct pt_regs *old_regs; old_regs = set_irq_regs(regs); - irq_enter(); s390_idle_check(); + irq_enter(); if (S390_lowcore.int_clock >= S390_lowcore.clock_comparator) /* Serve timer interrupts first. */ clock_comparator_work(); diff --git a/drivers/s390/s390mach.c b/drivers/s390/s390mach.c index 834e9ee7e93..92b0417f8e1 100644 --- a/drivers/s390/s390mach.c +++ b/drivers/s390/s390mach.c @@ -18,6 +18,7 @@ #include #include #include +#include #include "s390mach.h" static struct semaphore m_sem; @@ -369,6 +370,8 @@ s390_do_machine_check(struct pt_regs *regs) lockdep_off(); + s390_idle_check(); + mci = (struct mci *) &S390_lowcore.mcck_interruption_code; mcck = &__get_cpu_var(cpu_mcck); umode = user_mode(regs); From c742b31c03f37c5c499178f09f57381aa6c70131 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Wed, 31 Dec 2008 15:11:42 +0100 Subject: [PATCH 6/6] [PATCH] fast vdso implementation for CLOCK_THREAD_CPUTIME_ID The extract cpu time instruction (ectg) instruction allows the user process to get the current thread cputime without calling into the kernel. The code that uses the instruction needs to switch to the access registers mode to get access to the per-cpu info page that contains the two base values that are needed to calculate the current cputime from the CPU timer with the ectg instruction. Signed-off-by: Martin Schwidefsky --- arch/s390/include/asm/lowcore.h | 9 +- arch/s390/include/asm/vdso.h | 15 ++- arch/s390/kernel/asm-offsets.c | 5 + arch/s390/kernel/entry64.S | 45 +++++---- arch/s390/kernel/head64.S | 2 + arch/s390/kernel/setup.c | 2 + arch/s390/kernel/smp.c | 9 ++ arch/s390/kernel/vdso.c | 123 +++++++++++++++++++++++- arch/s390/kernel/vdso64/clock_getres.S | 5 + arch/s390/kernel/vdso64/clock_gettime.S | 39 +++++++- 10 files changed, 222 insertions(+), 32 deletions(-) diff --git a/arch/s390/include/asm/lowcore.h b/arch/s390/include/asm/lowcore.h index a547817cf1a..ffdef5fe858 100644 --- a/arch/s390/include/asm/lowcore.h +++ b/arch/s390/include/asm/lowcore.h @@ -106,8 +106,10 @@ #define __LC_IPLDEV 0xDB8 #define __LC_CURRENT 0xDD8 #define __LC_INT_CLOCK 0xDE8 +#define __LC_VDSO_PER_CPU 0xE38 #endif /* __s390x__ */ +#define __LC_PASTE 0xE40 #define __LC_PANIC_MAGIC 0xE00 #ifndef __s390x__ @@ -381,7 +383,12 @@ struct _lowcore /* whether the kernel died with panic() or not */ __u32 panic_magic; /* 0xe00 */ - __u8 pad13[0x11b8-0xe04]; /* 0xe04 */ + /* Per cpu primary space access list */ + __u8 pad_0xe04[0xe3c-0xe04]; /* 0xe04 */ + __u32 vdso_per_cpu_data; /* 0xe3c */ + __u32 paste[16]; /* 0xe40 */ + + __u8 pad13[0x11b8-0xe80]; /* 0xe80 */ /* 64 bit extparam used for pfault, diag 250 etc */ __u64 ext_params2; /* 0x11B8 */ diff --git a/arch/s390/include/asm/vdso.h b/arch/s390/include/asm/vdso.h index a44f4fe16a3..7bdd7c8ebc9 100644 --- a/arch/s390/include/asm/vdso.h +++ b/arch/s390/include/asm/vdso.h @@ -12,9 +12,9 @@ #ifndef __ASSEMBLY__ /* - * Note about this structure: + * Note about the vdso_data and vdso_per_cpu_data structures: * - * NEVER USE THIS IN USERSPACE CODE DIRECTLY. The layout of this + * NEVER USE THEM IN USERSPACE CODE DIRECTLY. The layout of the * structure is supposed to be known only to the function in the vdso * itself and may change without notice. */ @@ -28,10 +28,21 @@ struct vdso_data { __u64 wtom_clock_nsec; /* 0x28 */ __u32 tz_minuteswest; /* Minutes west of Greenwich 0x30 */ __u32 tz_dsttime; /* Type of dst correction 0x34 */ + __u32 ectg_available; +}; + +struct vdso_per_cpu_data { + __u64 ectg_timer_base; + __u64 ectg_user_time; }; extern struct vdso_data *vdso_data; +#ifdef CONFIG_64BIT +int vdso_alloc_per_cpu(int cpu, struct _lowcore *lowcore); +void vdso_free_per_cpu(int cpu, struct _lowcore *lowcore); +#endif + #endif /* __ASSEMBLY__ */ #endif /* __KERNEL__ */ diff --git a/arch/s390/kernel/asm-offsets.c b/arch/s390/kernel/asm-offsets.c index e641f60bac9..67a60016bab 100644 --- a/arch/s390/kernel/asm-offsets.c +++ b/arch/s390/kernel/asm-offsets.c @@ -48,6 +48,11 @@ int main(void) DEFINE(__VDSO_WTOM_SEC, offsetof(struct vdso_data, wtom_clock_sec)); DEFINE(__VDSO_WTOM_NSEC, offsetof(struct vdso_data, wtom_clock_nsec)); DEFINE(__VDSO_TIMEZONE, offsetof(struct vdso_data, tz_minuteswest)); + DEFINE(__VDSO_ECTG_OK, offsetof(struct vdso_data, ectg_available)); + DEFINE(__VDSO_ECTG_BASE, + offsetof(struct vdso_per_cpu_data, ectg_timer_base)); + DEFINE(__VDSO_ECTG_USER, + offsetof(struct vdso_per_cpu_data, ectg_user_time)); /* constants used by the vdso */ DEFINE(CLOCK_REALTIME, CLOCK_REALTIME); DEFINE(CLOCK_MONOTONIC, CLOCK_MONOTONIC); diff --git a/arch/s390/kernel/entry64.S b/arch/s390/kernel/entry64.S index ae83c195171..c6fbde13971 100644 --- a/arch/s390/kernel/entry64.S +++ b/arch/s390/kernel/entry64.S @@ -177,8 +177,11 @@ _TIF_WORK_INT = (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_NEED_RESCHED | \ .if !\sync ni \psworg+1,0xfd # clear wait state bit .endif - lmg %r0,%r15,SP_R0(%r15) # load gprs 0-15 of user + lg %r14,__LC_VDSO_PER_CPU + lmg %r0,%r13,SP_R0(%r15) # load gprs 0-13 of user stpt __LC_EXIT_TIMER + mvc __VDSO_ECTG_BASE(16,%r14),__LC_EXIT_TIMER + lmg %r14,%r15,SP_R14(%r15) # load grps 14-15 of user lpswe \psworg # back to caller .endm @@ -980,23 +983,23 @@ cleanup_sysc_return: cleanup_sysc_leave: clc 8(8,%r12),BASED(cleanup_sysc_leave_insn) - je 2f - mvc __LC_EXIT_TIMER(8),__LC_ASYNC_ENTER_TIMER + je 3f clc 8(8,%r12),BASED(cleanup_sysc_leave_insn+8) - je 2f - mvc __LC_RETURN_PSW(16),SP_PSW(%r15) + jhe 0f + mvc __LC_EXIT_TIMER(8),__LC_ASYNC_ENTER_TIMER +0: mvc __LC_RETURN_PSW(16),SP_PSW(%r15) cghi %r12,__LC_MCK_OLD_PSW - jne 0f + jne 1f mvc __LC_SAVE_AREA+64(32),SP_R12(%r15) - j 1f -0: mvc __LC_SAVE_AREA+32(32),SP_R12(%r15) -1: lmg %r0,%r11,SP_R0(%r15) + j 2f +1: mvc __LC_SAVE_AREA+32(32),SP_R12(%r15) +2: lmg %r0,%r11,SP_R0(%r15) lg %r15,SP_R15(%r15) -2: la %r12,__LC_RETURN_PSW +3: la %r12,__LC_RETURN_PSW br %r14 cleanup_sysc_leave_insn: .quad sysc_done - 4 - .quad sysc_done - 8 + .quad sysc_done - 16 cleanup_io_return: mvc __LC_RETURN_PSW(8),0(%r12) @@ -1006,23 +1009,23 @@ cleanup_io_return: cleanup_io_leave: clc 8(8,%r12),BASED(cleanup_io_leave_insn) - je 2f - mvc __LC_EXIT_TIMER(8),__LC_ASYNC_ENTER_TIMER + je 3f clc 8(8,%r12),BASED(cleanup_io_leave_insn+8) - je 2f - mvc __LC_RETURN_PSW(16),SP_PSW(%r15) + jhe 0f + mvc __LC_EXIT_TIMER(8),__LC_ASYNC_ENTER_TIMER +0: mvc __LC_RETURN_PSW(16),SP_PSW(%r15) cghi %r12,__LC_MCK_OLD_PSW - jne 0f + jne 1f mvc __LC_SAVE_AREA+64(32),SP_R12(%r15) - j 1f -0: mvc __LC_SAVE_AREA+32(32),SP_R12(%r15) -1: lmg %r0,%r11,SP_R0(%r15) + j 2f +1: mvc __LC_SAVE_AREA+32(32),SP_R12(%r15) +2: lmg %r0,%r11,SP_R0(%r15) lg %r15,SP_R15(%r15) -2: la %r12,__LC_RETURN_PSW +3: la %r12,__LC_RETURN_PSW br %r14 cleanup_io_leave_insn: .quad io_done - 4 - .quad io_done - 8 + .quad io_done - 16 /* * Integer constants diff --git a/arch/s390/kernel/head64.S b/arch/s390/kernel/head64.S index 3ccd36b24b8..f9f70aa1524 100644 --- a/arch/s390/kernel/head64.S +++ b/arch/s390/kernel/head64.S @@ -87,6 +87,8 @@ startup_continue: lg %r12,.Lparmaddr-.LPG1(%r13) # pointer to parameter area # move IPL device to lowcore mvc __LC_IPLDEV(4),IPL_DEVICE+4-PARMAREA(%r12) + lghi %r0,__LC_PASTE + stg %r0,__LC_VDSO_PER_CPU # # Setup stack # diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c index b7a1efd5522..d825f4950e4 100644 --- a/arch/s390/kernel/setup.c +++ b/arch/s390/kernel/setup.c @@ -427,6 +427,8 @@ setup_lowcore(void) /* enable extended save area */ __ctl_set_bit(14, 29); } +#else + lc->vdso_per_cpu_data = (unsigned long) &lc->paste[0]; #endif set_prefix((u32)(unsigned long) lc); } diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c index 3979a6fc088..b3461e8f170 100644 --- a/arch/s390/kernel/smp.c +++ b/arch/s390/kernel/smp.c @@ -47,6 +47,7 @@ #include #include #include +#include #include "entry.h" /* @@ -506,6 +507,9 @@ static int __cpuinit smp_alloc_lowcore(int cpu) goto out; lowcore->extended_save_area_addr = (u32) save_area; } +#else + if (vdso_alloc_per_cpu(cpu, lowcore)) + goto out; #endif lowcore_ptr[cpu] = lowcore; return 0; @@ -528,6 +532,8 @@ static void smp_free_lowcore(int cpu) #ifndef CONFIG_64BIT if (MACHINE_HAS_IEEE) free_page((unsigned long) lowcore->extended_save_area_addr); +#else + vdso_free_per_cpu(cpu, lowcore); #endif free_page(lowcore->panic_stack - PAGE_SIZE); free_pages(lowcore->async_stack - ASYNC_SIZE, ASYNC_ORDER); @@ -670,6 +676,7 @@ void __init smp_prepare_cpus(unsigned int max_cpus) lowcore = (void *) __get_free_pages(GFP_KERNEL | GFP_DMA, lc_order); panic_stack = __get_free_page(GFP_KERNEL); async_stack = __get_free_pages(GFP_KERNEL, ASYNC_ORDER); + BUG_ON(!lowcore || !panic_stack || !async_stack); #ifndef CONFIG_64BIT if (MACHINE_HAS_IEEE) save_area = get_zeroed_page(GFP_KERNEL); @@ -683,6 +690,8 @@ void __init smp_prepare_cpus(unsigned int max_cpus) #ifndef CONFIG_64BIT if (MACHINE_HAS_IEEE) lowcore->extended_save_area_addr = (u32) save_area; +#else + BUG_ON(vdso_alloc_per_cpu(smp_processor_id(), lowcore)); #endif set_prefix((u32)(unsigned long) lowcore); local_mcck_enable(); diff --git a/arch/s390/kernel/vdso.c b/arch/s390/kernel/vdso.c index 10a6ccef441..25a6a82f1c0 100644 --- a/arch/s390/kernel/vdso.c +++ b/arch/s390/kernel/vdso.c @@ -31,9 +31,6 @@ #include #include -/* Max supported size for symbol names */ -#define MAX_SYMNAME 64 - #if defined(CONFIG_32BIT) || defined(CONFIG_COMPAT) extern char vdso32_start, vdso32_end; static void *vdso32_kbase = &vdso32_start; @@ -70,6 +67,119 @@ static union { } vdso_data_store __attribute__((__section__(".data.page_aligned"))); struct vdso_data *vdso_data = &vdso_data_store.data; +/* + * Setup vdso data page. + */ +static void vdso_init_data(struct vdso_data *vd) +{ + unsigned int facility_list; + + facility_list = stfl(); + vd->ectg_available = switch_amode && (facility_list & 1); +} + +#ifdef CONFIG_64BIT +/* + * Setup per cpu vdso data page. + */ +static void vdso_init_per_cpu_data(int cpu, struct vdso_per_cpu_data *vpcd) +{ +} + +/* + * Allocate/free per cpu vdso data. + */ +#ifdef CONFIG_64BIT +#define SEGMENT_ORDER 2 +#else +#define SEGMENT_ORDER 1 +#endif + +int vdso_alloc_per_cpu(int cpu, struct _lowcore *lowcore) +{ + unsigned long segment_table, page_table, page_frame; + u32 *psal, *aste; + int i; + + lowcore->vdso_per_cpu_data = __LC_PASTE; + + if (!switch_amode || !vdso_enabled) + return 0; + + segment_table = __get_free_pages(GFP_KERNEL, SEGMENT_ORDER); + page_table = get_zeroed_page(GFP_KERNEL | GFP_DMA); + page_frame = get_zeroed_page(GFP_KERNEL); + if (!segment_table || !page_table || !page_frame) + goto out; + + clear_table((unsigned long *) segment_table, _SEGMENT_ENTRY_EMPTY, + PAGE_SIZE << SEGMENT_ORDER); + clear_table((unsigned long *) page_table, _PAGE_TYPE_EMPTY, + 256*sizeof(unsigned long)); + + *(unsigned long *) segment_table = _SEGMENT_ENTRY + page_table; + *(unsigned long *) page_table = _PAGE_RO + page_frame; + + psal = (u32 *) (page_table + 256*sizeof(unsigned long)); + aste = psal + 32; + + for (i = 4; i < 32; i += 4) + psal[i] = 0x80000000; + + lowcore->paste[4] = (u32)(addr_t) psal; + psal[0] = 0x20000000; + psal[2] = (u32)(addr_t) aste; + *(unsigned long *) (aste + 2) = segment_table + + _ASCE_TABLE_LENGTH + _ASCE_USER_BITS + _ASCE_TYPE_SEGMENT; + aste[4] = (u32)(addr_t) psal; + lowcore->vdso_per_cpu_data = page_frame; + + vdso_init_per_cpu_data(cpu, (struct vdso_per_cpu_data *) page_frame); + return 0; + +out: + free_page(page_frame); + free_page(page_table); + free_pages(segment_table, SEGMENT_ORDER); + return -ENOMEM; +} + +#ifdef CONFIG_HOTPLUG_CPU +void vdso_free_per_cpu(int cpu, struct _lowcore *lowcore) +{ + unsigned long segment_table, page_table, page_frame; + u32 *psal, *aste; + + if (!switch_amode || !vdso_enabled) + return; + + psal = (u32 *)(addr_t) lowcore->paste[4]; + aste = (u32 *)(addr_t) psal[2]; + segment_table = *(unsigned long *)(aste + 2) & PAGE_MASK; + page_table = *(unsigned long *) segment_table; + page_frame = *(unsigned long *) page_table; + + free_page(page_frame); + free_page(page_table); + free_pages(segment_table, SEGMENT_ORDER); +} +#endif /* CONFIG_HOTPLUG_CPU */ + +static void __vdso_init_cr5(void *dummy) +{ + unsigned long cr5; + + cr5 = offsetof(struct _lowcore, paste); + __ctl_load(cr5, 5, 5); +} + +static void vdso_init_cr5(void) +{ + if (switch_amode && vdso_enabled) + on_each_cpu(__vdso_init_cr5, NULL, 1); +} +#endif /* CONFIG_64BIT */ + /* * This is called from binfmt_elf, we create the special vma for the * vDSO and insert it into the mm struct tree @@ -172,6 +282,9 @@ static int __init vdso_init(void) { int i; + if (!vdso_enabled) + return 0; + vdso_init_data(vdso_data); #if defined(CONFIG_32BIT) || defined(CONFIG_COMPAT) /* Calculate the size of the 32 bit vDSO */ vdso32_pages = ((&vdso32_end - &vdso32_start @@ -208,6 +321,10 @@ static int __init vdso_init(void) } vdso64_pagelist[vdso64_pages - 1] = virt_to_page(vdso_data); vdso64_pagelist[vdso64_pages] = NULL; +#ifndef CONFIG_SMP + BUG_ON(vdso_alloc_per_cpu(0, S390_lowcore)); +#endif + vdso_init_cr5(); #endif /* CONFIG_64BIT */ get_page(virt_to_page(vdso_data)); diff --git a/arch/s390/kernel/vdso64/clock_getres.S b/arch/s390/kernel/vdso64/clock_getres.S index 488e31a3c0e..9ce8caafdb4 100644 --- a/arch/s390/kernel/vdso64/clock_getres.S +++ b/arch/s390/kernel/vdso64/clock_getres.S @@ -22,7 +22,12 @@ __kernel_clock_getres: cghi %r2,CLOCK_REALTIME je 0f cghi %r2,CLOCK_MONOTONIC + je 0f + cghi %r2,-2 /* CLOCK_THREAD_CPUTIME_ID for this thread */ jne 2f + larl %r5,_vdso_data + icm %r0,15,__LC_ECTG_OK(%r5) + jz 2f 0: ltgr %r3,%r3 jz 1f /* res == NULL */ larl %r1,3f diff --git a/arch/s390/kernel/vdso64/clock_gettime.S b/arch/s390/kernel/vdso64/clock_gettime.S index 738a410b7eb..79dbfee831e 100644 --- a/arch/s390/kernel/vdso64/clock_gettime.S +++ b/arch/s390/kernel/vdso64/clock_gettime.S @@ -22,8 +22,10 @@ __kernel_clock_gettime: larl %r5,_vdso_data cghi %r2,CLOCK_REALTIME je 4f + cghi %r2,-2 /* CLOCK_THREAD_CPUTIME_ID for this thread */ + je 9f cghi %r2,CLOCK_MONOTONIC - jne 9f + jne 12f /* CLOCK_MONOTONIC */ ltgr %r3,%r3 @@ -42,7 +44,7 @@ __kernel_clock_gettime: alg %r0,__VDSO_WTOM_SEC(%r5) clg %r4,__VDSO_UPD_COUNT(%r5) /* check update counter */ jne 0b - larl %r5,10f + larl %r5,13f 1: clg %r1,0(%r5) jl 2f slg %r1,0(%r5) @@ -68,7 +70,7 @@ __kernel_clock_gettime: lg %r0,__VDSO_XTIME_SEC(%r5) clg %r4,__VDSO_UPD_COUNT(%r5) /* check update counter */ jne 5b - larl %r5,10f + larl %r5,13f 6: clg %r1,0(%r5) jl 7f slg %r1,0(%r5) @@ -79,11 +81,38 @@ __kernel_clock_gettime: 8: lghi %r2,0 br %r14 + /* CLOCK_THREAD_CPUTIME_ID for this thread */ +9: icm %r0,15,__VDSO_ECTG_OK(%r5) + jz 12f + ear %r2,%a4 + llilh %r4,0x0100 + sar %a4,%r4 + lghi %r4,0 + sacf 512 /* Magic ectg instruction */ + .insn ssf,0xc80100000000,__VDSO_ECTG_BASE(4),__VDSO_ECTG_USER(4),4 + sacf 0 + sar %a4,%r2 + algr %r1,%r0 /* r1 = cputime as TOD value */ + mghi %r1,1000 /* convert to nanoseconds */ + srlg %r1,%r1,12 /* r1 = cputime in nanosec */ + lgr %r4,%r1 + larl %r5,13f + srlg %r1,%r1,9 /* divide by 1000000000 */ + mlg %r0,8(%r5) + srlg %r0,%r0,11 /* r0 = tv_sec */ + stg %r0,0(%r3) + msg %r0,0(%r5) /* calculate tv_nsec */ + slgr %r4,%r0 /* r4 = tv_nsec */ + stg %r4,8(%r3) + lghi %r2,0 + br %r14 + /* Fallback to system call */ -9: lghi %r1,__NR_clock_gettime +12: lghi %r1,__NR_clock_gettime svc 0 br %r14 -10: .quad 1000000000 +13: .quad 1000000000 +14: .quad 19342813113834067 .cfi_endproc .size __kernel_clock_gettime,.-__kernel_clock_gettime