diff --git a/include/linux/clockchips.h b/include/linux/clockchips.h index 3a1dbba4d3a..20a100fe2b4 100644 --- a/include/linux/clockchips.h +++ b/include/linux/clockchips.h @@ -143,3 +143,12 @@ extern void clockevents_notify(unsigned long reason, void *arg); #endif #endif + +#ifdef CONFIG_GENERIC_CLOCKEVENTS +extern ktime_t clockevents_get_next_event(int cpu); +#else +static inline ktime_t clockevents_get_next_event(int cpu) +{ + return (ktime_t) { .tv64 = KTIME_MAX }; +} +#endif diff --git a/include/linux/sched.h b/include/linux/sched.h index 61850401040..311dec12397 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -257,6 +257,7 @@ extern void task_rq_unlock_wait(struct task_struct *p); extern cpumask_var_t nohz_cpu_mask; #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ) extern int select_nohz_load_balancer(int cpu); +extern int get_nohz_load_balancer(void); #else static inline int select_nohz_load_balancer(int cpu) { @@ -1772,6 +1773,17 @@ int sched_nr_latency_handler(struct ctl_table *table, int write, struct file *file, void __user *buffer, size_t *length, loff_t *ppos); #endif +#ifdef CONFIG_SCHED_DEBUG +static inline unsigned int get_sysctl_timer_migration(void) +{ + return sysctl_timer_migration; +} +#else +static inline unsigned int get_sysctl_timer_migration(void) +{ + return 1; +} +#endif extern unsigned int sysctl_sched_rt_period; extern int sysctl_sched_rt_runtime; diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index c71bcd54924..b675a67c9ac 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -43,6 +43,8 @@ #include #include #include +#include +#include #include @@ -198,8 +200,19 @@ switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base, { struct hrtimer_clock_base *new_base; struct hrtimer_cpu_base *new_cpu_base; + int cpu, preferred_cpu = -1; - new_cpu_base = &__get_cpu_var(hrtimer_bases); + cpu = smp_processor_id(); +#if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP) + if (!pinned && get_sysctl_timer_migration() && idle_cpu(cpu)) { + preferred_cpu = get_nohz_load_balancer(); + if (preferred_cpu >= 0) + cpu = preferred_cpu; + } +#endif + +again: + new_cpu_base = &per_cpu(hrtimer_bases, cpu); new_base = &new_cpu_base->clock_base[base->index]; if (base != new_base) { @@ -219,6 +232,40 @@ switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base, timer->base = NULL; spin_unlock(&base->cpu_base->lock); spin_lock(&new_base->cpu_base->lock); + + /* Optimized away for NOHZ=n SMP=n */ + if (cpu == preferred_cpu) { + /* Calculate clock monotonic expiry time */ +#ifdef CONFIG_HIGH_RES_TIMERS + ktime_t expires = ktime_sub(hrtimer_get_expires(timer), + new_base->offset); +#else + ktime_t expires = hrtimer_get_expires(timer); +#endif + + /* + * Get the next event on target cpu from the + * clock events layer. + * This covers the highres=off nohz=on case as well. + */ + ktime_t next = clockevents_get_next_event(cpu); + + ktime_t delta = ktime_sub(expires, next); + + /* + * We do not migrate the timer when it is expiring + * before the next event on the target cpu because + * we cannot reprogram the target cpu hardware and + * we would cause it to fire late. + */ + if (delta.tv64 < 0) { + cpu = smp_processor_id(); + spin_unlock(&new_base->cpu_base->lock); + spin_lock(&base->cpu_base->lock); + timer->base = base; + goto again; + } + } timer->base = new_base; } return new_base; @@ -236,7 +283,7 @@ lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) return base; } -# define switch_hrtimer_base(t, b) (b) +# define switch_hrtimer_base(t, b, p) (b) #endif /* !CONFIG_SMP */ diff --git a/kernel/sched.c b/kernel/sched.c index 7f1dd56af86..9fe3774a0fd 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4244,6 +4244,11 @@ static struct { .load_balancer = ATOMIC_INIT(-1), }; +int get_nohz_load_balancer(void) +{ + return atomic_read(&nohz.load_balancer); +} + /* * This routine will try to nominate the ilb (idle load balancing) * owner among the cpus whose ticks are stopped. ilb owner will do the idle diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index d13be216a79..ab20ded013b 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -18,6 +18,7 @@ #include #include #include +#include /* The registered clock event devices */ static LIST_HEAD(clockevent_devices); @@ -251,4 +252,15 @@ void clockevents_notify(unsigned long reason, void *arg) spin_unlock(&clockevents_lock); } EXPORT_SYMBOL_GPL(clockevents_notify); + +ktime_t clockevents_get_next_event(int cpu) +{ + struct tick_device *td; + struct clock_event_device *dev; + + td = &per_cpu(tick_cpu_device, cpu); + dev = td->evtdev; + + return dev->next_event; +} #endif diff --git a/kernel/timer.c b/kernel/timer.c index 3424dfd11d5..3f841db5edf 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -37,6 +37,7 @@ #include #include #include +#include #include #include @@ -609,9 +610,7 @@ __mod_timer(struct timer_list *timer, unsigned long expires, { struct tvec_base *base, *new_base; unsigned long flags; - int ret; - - ret = 0; + int ret = 0 , cpu; timer_stats_timer_set_start_info(timer); BUG_ON(!timer->function); @@ -630,6 +629,18 @@ __mod_timer(struct timer_list *timer, unsigned long expires, new_base = __get_cpu_var(tvec_bases); + cpu = smp_processor_id(); + +#if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP) + if (!pinned && get_sysctl_timer_migration() && idle_cpu(cpu)) { + int preferred_cpu = get_nohz_load_balancer(); + + if (preferred_cpu >= 0) + cpu = preferred_cpu; + } +#endif + new_base = per_cpu(tvec_bases, cpu); + if (base != new_base) { /* * We are trying to schedule the timer on the local CPU.