kernel/sched_clock.c
Steven Rostedt c300ba2528 sched_clock: add multiplier for TSC to gtod drift
The sched_clock code currently tries to keep the clocks of all CPUs
somewhat in sync. At every clock tick it records the gtod clock and
uses that, together with jiffies and the TSC, to calculate a per-CPU
clock that tries to stay in sync with all the other CPUs.

ftrace depends heavily on this clock and detects when it "jumps".
One problem is that the TSC and the gtod also drift relative to each
other. When the TSC is 0.1% faster or slower than the gtod, this is
very noticeable in ftrace. To help compensate for this, I've added a
multiplier that tries to keep the CPU clock updating at the same rate
as the gtod.

I've tried various ways to keep the two in sync, and this ended up
being the most reliable. At every scheduler tick we calculate the new
multiplier:

  multi = delta_gtod / delta_TSC

This means we perform one 64-bit divide at the tick (once per jiffy).
A shift is used to preserve accuracy (fixed-point arithmetic).
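
As a rough illustration only (this is not the kernel code itself; the
helper name calc_multi is made up, it assumes the MULTI_SHIFT value of
15 used below, and it uses plain 64-bit division instead of do_div()),
the fixed-point math works like this:

  /* ratio of gtod time to raw (TSC-based) time, with 15 fractional bits */
  static inline long long calc_multi(long long delta_gtod, long long delta_raw)
  {
          return (delta_gtod << 15) / delta_raw;  /* 1 << 15 means "same rate" */
  }

  /*
   * A raw delta is then scaled with:  delta = (delta * multi) >> 15;
   * e.g. delta_gtod = 1000000 ns and delta_raw = 1001000 ns give
   * multi = 32735 (about 0.999 * 32768), shrinking raw deltas by ~0.1%.
   */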

Other methods that failed due to dynamic HZ are:

(not used)  multi += (gtod - tsc) / delta_gtod
(not used)  multi += (gtod - (last_tsc + delta_tsc)) / delta_gtod

as well as other variants.

This code still allows for a slight drift between TSC and gtod, but
it keeps the damage down to a minimum.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Cc: Steven Rostedt <srostedt@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: john stultz <johnstul@us.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-07-11 15:53:28 +02:00


/*
 * sched_clock for unstable cpu clocks
 *
 *  Copyright (C) 2008 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
 *
 *  Updates and enhancements:
 *    Copyright (C) 2008 Red Hat, Inc. Steven Rostedt <srostedt@redhat.com>
 *
 * Based on code by:
 *   Ingo Molnar <mingo@redhat.com>
 *   Guillaume Chazarain <guichaz@gmail.com>
 *
 * Create a semi stable clock from a mixture of other events, including:
 *  - gtod
 *  - jiffies
 *  - sched_clock()
 *  - explicit idle events
 *
 * We use gtod as base and the unstable clock deltas. The deltas are filtered,
 * making it monotonic and keeping it within an expected window. This window
 * is set up using jiffies.
 *
 * Furthermore, explicit sleep and wakeup hooks allow us to account for time
 * that is otherwise invisible (TSC gets stopped).
 *
 * The clock: sched_clock_cpu() is monotonic per cpu, and should be somewhat
 * consistent between cpus (never more than 1 jiffy difference).
 */

#include <linux/sched.h>
#include <linux/percpu.h>
#include <linux/spinlock.h>
#include <linux/ktime.h>
#include <linux/module.h>
#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
#define MULTI_SHIFT 15
/* Max is double, Min is 1/2 */
#define MAX_MULTI (2LL << MULTI_SHIFT)
#define MIN_MULTI (1LL << (MULTI_SHIFT-1))
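
/*
 * scd->multi is a fixed-point ratio (gtod rate over raw sched_clock() rate)
 * with MULTI_SHIFT fractional bits; 1 << MULTI_SHIFT represents a ratio of 1.
 */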
struct sched_clock_data {
        /*
         * Raw spinlock - this is a special case: this might be called
         * from within instrumentation code so we don't want to do any
         * instrumentation ourselves.
         */
        raw_spinlock_t          lock;

        unsigned long           tick_jiffies;
        u64                     prev_raw;
        u64                     tick_raw;
        u64                     tick_gtod;
        u64                     clock;
        s64                     multi;
#ifdef CONFIG_NO_HZ
        int                     check_max;
#endif
};

static DEFINE_PER_CPU_SHARED_ALIGNED(struct sched_clock_data, sched_clock_data);

static inline struct sched_clock_data *this_scd(void)
{
        return &__get_cpu_var(sched_clock_data);
}

static inline struct sched_clock_data *cpu_sdc(int cpu)
{
        return &per_cpu(sched_clock_data, cpu);
}

static __read_mostly int sched_clock_running;

void sched_clock_init(void)
{
        u64 ktime_now = ktime_to_ns(ktime_get());
        unsigned long now_jiffies = jiffies;
        int cpu;

        for_each_possible_cpu(cpu) {
                struct sched_clock_data *scd = cpu_sdc(cpu);

                scd->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
                scd->tick_jiffies = now_jiffies;
                scd->prev_raw = 0;
                scd->tick_raw = 0;
                scd->tick_gtod = ktime_now;
                scd->clock = ktime_now;
                scd->multi = 1 << MULTI_SHIFT;
#ifdef CONFIG_NO_HZ
                scd->check_max = 1;
#endif
        }

        sched_clock_running = 1;
}

#ifdef CONFIG_NO_HZ
/*
 * Dynamic ticks make the delta jiffies inaccurate. This prevents us
 * from checking the maximum time update.
 * Disable the maximum check while ticks are stopped.
 */
void sched_clock_tick_stop(int cpu)
{
        struct sched_clock_data *scd = cpu_sdc(cpu);

        scd->check_max = 0;
}

void sched_clock_tick_start(int cpu)
{
        struct sched_clock_data *scd = cpu_sdc(cpu);

        scd->check_max = 1;
}

static int check_max(struct sched_clock_data *scd)
{
        return scd->check_max;
}
#else
static int check_max(struct sched_clock_data *scd)
{
        return 1;
}
#endif /* CONFIG_NO_HZ */

/*
 * update the percpu scd from the raw @now value
 *
 *  - filter out backward motion
 *  - use jiffies to generate a min,max window to clip the raw values
 */
static void __update_sched_clock(struct sched_clock_data *scd, u64 now, u64 *time)
{
        unsigned long now_jiffies = jiffies;
        long delta_jiffies = now_jiffies - scd->tick_jiffies;
        u64 clock = scd->clock;
        u64 min_clock, max_clock;
        s64 delta = now - scd->prev_raw;

        WARN_ON_ONCE(!irqs_disabled());

        /*
         * At the scheduler tick the clock can be just under the gtod. We
         * don't want to push it forward prematurely.
         */
        min_clock = scd->tick_gtod + (delta_jiffies * TICK_NSEC);
        if (min_clock > TICK_NSEC)
                min_clock -= TICK_NSEC / 2;

        if (unlikely(delta < 0)) {
                clock++;
                goto out;
        }

        /*
         * The clock must stay within a jiffy of the gtod. But since we may
         * be at the start of a jiffy or at its end, we add another jiffy of
         * buffer.
         */
        max_clock = scd->tick_gtod + (2 + delta_jiffies) * TICK_NSEC;
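
        /*
         * Scale the raw delta by the gtod/raw ratio computed at the last
         * tick (fixed point, MULTI_SHIFT fractional bits).
         */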
        delta *= scd->multi;
        delta >>= MULTI_SHIFT;

        if (unlikely(clock + delta > max_clock) && check_max(scd)) {
                if (clock < max_clock)
                        clock = max_clock;
                else
                        clock++;
        } else {
                clock += delta;
        }

out:
        if (unlikely(clock < min_clock))
                clock = min_clock;

        if (time)
                *time = clock;
        else {
                scd->prev_raw = now;
                scd->clock = clock;
        }
}

static void lock_double_clock(struct sched_clock_data *data1,
                              struct sched_clock_data *data2)
{
        if (data1 < data2) {
                __raw_spin_lock(&data1->lock);
                __raw_spin_lock(&data2->lock);
        } else {
                __raw_spin_lock(&data2->lock);
                __raw_spin_lock(&data1->lock);
        }
}

u64 sched_clock_cpu(int cpu)
{
        struct sched_clock_data *scd = cpu_sdc(cpu);
        u64 now, clock;

        if (unlikely(!sched_clock_running))
                return 0ull;

        WARN_ON_ONCE(!irqs_disabled());
        now = sched_clock();

        if (cpu != raw_smp_processor_id()) {
                /*
                 * in order to update a remote cpu's clock based on our
                 * unstable raw time, rebase it against:
                 *      tick_raw        (offset between raw counters)
                 *      tick_gtod       (tick offset between cpus)
                 */
                struct sched_clock_data *my_scd = this_scd();

                lock_double_clock(scd, my_scd);

                now -= my_scd->tick_raw;
                now += scd->tick_raw;

                now += my_scd->tick_gtod;
                now -= scd->tick_gtod;

                __raw_spin_unlock(&my_scd->lock);

                __update_sched_clock(scd, now, &clock);

                __raw_spin_unlock(&scd->lock);
        } else {
                __raw_spin_lock(&scd->lock);
                __update_sched_clock(scd, now, NULL);
                clock = scd->clock;
                __raw_spin_unlock(&scd->lock);
        }

        return clock;
}

void sched_clock_tick(void)
{
        struct sched_clock_data *scd = this_scd();
        unsigned long now_jiffies = jiffies;
        s64 mult, delta_gtod, delta_raw;
        u64 now, now_gtod;

        if (unlikely(!sched_clock_running))
                return;

        WARN_ON_ONCE(!irqs_disabled());

        now_gtod = ktime_to_ns(ktime_get());
        now = sched_clock();

        __raw_spin_lock(&scd->lock);
        __update_sched_clock(scd, now, NULL);

        /*
         * update tick_gtod after __update_sched_clock() because that will
         * already observe 1 new jiffy; adding a new tick_gtod to that would
         * increase the clock 2 jiffies.
         */
        delta_gtod = now_gtod - scd->tick_gtod;
        delta_raw = now - scd->tick_raw;
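
        /*
         * Recompute the gtod/raw ratio over the last tick interval and
         * clamp it to [MIN_MULTI, MAX_MULTI]; if the raw clock did not
         * advance, fall back to a ratio of 1.
         */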
        if ((long)delta_raw > 0) {
                mult = delta_gtod << MULTI_SHIFT;
                do_div(mult, delta_raw);
                scd->multi = mult;
                if (scd->multi > MAX_MULTI)
                        scd->multi = MAX_MULTI;
                else if (scd->multi < MIN_MULTI)
                        scd->multi = MIN_MULTI;
        } else
                scd->multi = 1 << MULTI_SHIFT;

        scd->tick_raw = now;
        scd->tick_gtod = now_gtod;
        scd->tick_jiffies = now_jiffies;
        __raw_spin_unlock(&scd->lock);
}

/*
 * We are going deep-idle (irqs are disabled):
 */
void sched_clock_idle_sleep_event(void)
{
        sched_clock_cpu(smp_processor_id());
}
EXPORT_SYMBOL_GPL(sched_clock_idle_sleep_event);

/*
 * We just idled delta nanoseconds (called with irqs disabled):
 */
void sched_clock_idle_wakeup_event(u64 delta_ns)
{
        struct sched_clock_data *scd = this_scd();
        u64 now = sched_clock();

        /*
         * Override the previous timestamp and ignore all
         * sched_clock() deltas that occurred while we idled,
         * and use the PM-provided delta_ns to advance the
         * rq clock:
         */
        __raw_spin_lock(&scd->lock);
        scd->prev_raw = now;
        scd->clock += delta_ns;
        scd->multi = 1 << MULTI_SHIFT;
        __raw_spin_unlock(&scd->lock);

        touch_softlockup_watchdog();
}
EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);

#endif

/*
 * Scheduler clock - returns current time in nanosec units.
 * This is the default implementation; architectures and
 * sub-architectures can override it.
 */
unsigned long long __attribute__((weak)) sched_clock(void)
{
        return (unsigned long long)jiffies * (NSEC_PER_SEC / HZ);
}